Zero-inflation
Various ways in which zero-inflation can be detected and handled.
Used packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(scales)
library(pscl)
## Warning: package 'pscl' was built under R version 4.0.2
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(MASS)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
Lots of zero’s doesn’t imply zero-inflation
set.seed(1234)
n <- 1e4
n.sim <- 1e3
mean.poisson <- 0.05
mean.zeroinfl <- 10000
prop.zeroinfl <- 0.1
dataset <- data.frame(
Poisson = rpois(n, lambda = mean.poisson)
)
ggplot() +
geom_linerange(
data = dataset %>%
filter(Poisson == 0) %>%
count(Poisson),
mapping = aes(
x = Poisson,
ymin = 0,
ymax = n
)
) +
geom_histogram(
data = dataset %>%
filter(Poisson > 0),
mapping = aes(x = Poisson),
boundary = 0
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table(dataset$Poisson)
##
## 0 1 2
## 9533 462 5
The example above generates values from a Poisson distribution with mean 0.05. Although it has no zero-inflation, 95.3% of the values are zero.
Zero-inflation can also occur with low number of zero’s
dataset$ZeroInflatedPoisson <- rbinom(n, size = 1, prob = 1 - prop.zeroinfl) *
rpois(n, lambda = mean.zeroinfl)
ggplot() +
geom_linerange(
data = dataset %>%
filter(ZeroInflatedPoisson == 0) %>%
count(ZeroInflatedPoisson),
mapping = aes(
x = ZeroInflatedPoisson,
ymin = 0,
ymax = n
)
) +
geom_histogram(
data = dataset %>%
filter(ZeroInflatedPoisson > 0),
mapping = aes(x = ZeroInflatedPoisson),
boundary = 0
)
table(dataset$ZeroInflatedPoisson == 0)
##
## FALSE TRUE
## 9007 993
The second example generates a data from a zero-inflated Poisson distribution with mean 10^{4} and 10, % excess zero’s. The actual proportion of zero’s is 9.9%.
How to test for zero-inflation
dataset <- expand.grid(
Mean = c(mean.poisson, mean.zeroinfl),
Rep = seq_len(n)
)
dataset$Poisson <- rpois(nrow(dataset), lambda = dataset$Mean)
dataset$ZeroInflatedPoisson <- rbinom(nrow(dataset), size = 1, prob = 1 - prop.zeroinfl) *
rpois(nrow(dataset), lambda = dataset$Mean)
ggplot() +
geom_linerange(
data = dataset %>%
filter(Poisson == 0) %>%
count(Poisson),
mapping = aes(
x = Poisson,
ymin = 0,
ymax = n
)
) +
geom_histogram(
data = dataset %>%
filter(Poisson > 0),
mapping = aes(x = Poisson),
boundary = 0
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() +
geom_linerange(
data = dataset %>%
filter(ZeroInflatedPoisson == 0) %>%
count(ZeroInflatedPoisson),
mapping = aes(
x = ZeroInflatedPoisson,
ymin = 0,
ymax = n
)
) +
geom_histogram(
data = dataset %>%
filter(ZeroInflatedPoisson > 0),
mapping = aes(x = ZeroInflatedPoisson),
boundary = 0
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For this example we generate a new dataset with two levels of Mean: 0.05, 10000.00. We generate both a Poisson and a zero-inflated Poisson variable. The latter has 10, % excess zero’s. The proportion of zero’s the variables are respectively 47.3% and 52.7%.
Let’s assume that our hypothesis is that all observations share the same mean. Note that is clearly not the case. If this hypothesis would hold, then we fit a simple intercept-only model.
m.pois.simple <- glm(Poisson ~ 1, data = dataset, family = poisson)
m.zero.simple <- glm(ZeroInflatedPoisson ~ 1, data = dataset, family = poisson)
summary(m.pois.simple)
##
## Call:
## glm(formula = Poisson ~ 1, family = poisson, data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -100.00 -100.00 -20.85 62.15 66.33
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.5172 0.0001 85172 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 138627905 on 19999 degrees of freedom
## Residual deviance: 138627905 on 19999 degrees of freedom
## AIC: 138739486
##
## Number of Fisher Scoring iterations: 6
summary(m.zero.simple)
##
## Call:
## glm(formula = ZeroInflatedPoisson ~ 1, family = poisson, data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -94.96 -94.96 -94.96 70.19 74.27
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.4137957 0.0001053 79899 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 143653204 on 19999 degrees of freedom
## Residual deviance: 143653204 on 19999 degrees of freedom
## AIC: 143753725
##
## Number of Fisher Scoring iterations: 6
After fitting the model we generate at random a new response variable based on the models and count the number of zero’s. This is repeated several times so that we can get a distribion of zero’s based on the model. Finally we compare the number of zero’s in the original dataset with this distribution.
simulated <- data.frame(
PoissonSimple = apply(
simulate(m.pois.simple, nsim = n.sim) == 0,
2,
mean
),
ZeroInflatedSimple = apply(
simulate(m.zero.simple, nsim = n.sim) == 0,
2,
mean
)
)
ggplot(simulated, aes(x = PoissonSimple)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = mean(dataset$Poisson == 0), colour = "red") +
scale_x_continuous(label = percent)
ggplot(simulated, aes(x = ZeroInflatedSimple)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = mean(dataset$Poisson == 0), colour = "red") +
scale_x_continuous(label = percent)
In both cases none of the simulated values contains zero’s. The red line indicates the observed proportion of zero’s with is clearly greater. This indicates that we have either a poor model fit or zero-inflation.
Improving the model
In the case of this example we have two groups with very different means: one group with low mean generating lots of zero’s and one group with large mean generating no zero’s. Adding this grouping improves the model.
m.pois.complex <- glm(Poisson ~ factor(Mean), data = dataset, family = poisson)
m.zero.complex <- glm(ZeroInflatedPoisson ~ factor(Mean), data = dataset, family = poisson)
summary(m.pois.complex)
##
## Call:
## glm(formula = Poisson ~ factor(Mean), family = poisson, data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6720 -0.3353 -0.3353 0.1405 4.2399
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.87884 0.04218 -68.25 <2e-16 ***
## factor(Mean)10000 12.08917 0.04218 286.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 138627905 on 19999 degrees of freedom
## Residual deviance: 13118 on 19998 degrees of freedom
## AIC: 124702
##
## Number of Fisher Scoring iterations: 6
summary(m.zero.complex)
##
## Call:
## glm(formula = ZeroInflatedPoisson ~ factor(Mean), family = poisson,
## data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -134.295 -0.301 -0.301 10.023 13.695
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.09224 0.04691 -65.93 <2e-16 ***
## factor(Mean)10000 12.19918 0.04691 260.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 143653204 on 19999 degrees of freedom
## Residual deviance: 18653539 on 19998 degrees of freedom
## AIC: 18754062
##
## Number of Fisher Scoring iterations: 5
simulated$PoissonComplex <- apply(
simulate(m.pois.complex, nsim = n.sim) == 0,
2,
mean
)
simulated$ZeroInflatedComplex <- apply(
simulate(m.zero.complex, nsim = n.sim) == 0,
2,
mean
)
ggplot(simulated, aes(x = PoissonComplex)) +
geom_histogram(binwidth = 0.0005) +
geom_vline(xintercept = mean(dataset$Poisson == 0), colour = "red") +
scale_x_continuous(label = percent)
ggplot(simulated, aes(x = ZeroInflatedComplex)) +
geom_histogram(binwidth = 0.0005) +
geom_vline(xintercept = mean(dataset$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
In case of the Poisson variable, 45.1% of the simulated dataset from the complex model has more zero’s than the observed dataset. So this model can captures the zero’s and there is not zero-inflation. In case of the zero-inflated Poisson variable 0.0% of the simulated dataset from the complex model has more zero’s than the observed dataset. Hence we conclude that the zero’s are not modelled properly. We can’t improve the model further with covariates, hence we’ll have to treat is a a zero-inflated distribution.
m.zero.zi <- zeroinfl(
ZeroInflatedPoisson ~ factor(Mean) | 1,
data = dataset,
dist = "poisson"
)
summary(m.zero.zi)
##
## Call:
## zeroinfl(formula = ZeroInflatedPoisson ~ factor(Mean) | 1, data = dataset,
## dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -3.0282 -0.2126 -0.2126 0.3297 9.1506
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.98879 0.04716 -63.37 <2e-16 ***
## factor(Mean)10000 12.19909 0.04716 258.65 <2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2170 0.0336 -65.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 13
## Log-likelihood: -5.938e+04 on 3 Df
sim.zero.zi <- function(model) {
eta <- predict(model, type = "count")
prob <- predict(model, type = "zero")
new.value <-
rbinom(length(eta), size = 1, prob = 1 - prob) *
rpois(length(eta), lambda = eta)
mean(new.value == 0)
}
simulated$ZeroInflatedZI <- replicate(n.sim, sim.zero.zi(m.zero.zi))
ggplot(simulated, aes(x = ZeroInflatedZI)) +
geom_histogram(binwidth = 0.0005) +
geom_vline(xintercept = mean(dataset$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
When we fit a proper zero-inflated model to the zero-inflated Poisson variable, then 45.5% of the simulated datasets have more zero’s than the observed dataset. So the zero-inflation is properly handled.
Modelling zero-inflation by overdispersion
m.zero.nb <- glm.nb(ZeroInflatedPoisson ~ factor(Mean), data = dataset)
summary(m.zero.nb)
##
## Call:
## glm.nb(formula = ZeroInflatedPoisson ~ factor(Mean), data = dataset,
## init.theta = 0.6535490762, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5298 -0.2963 -0.2963 0.0850 2.8393
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.09224 0.04854 -63.71 <2e-16 ***
## factor(Mean)10000 12.19918 0.05009 243.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.6535) family taken to be 1)
##
## Null deviance: 130864 on 19999 degrees of freedom
## Residual deviance: 14676 on 19998 degrees of freedom
## AIC: 204749
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.65355
## Std. Err.: 0.00868
##
## 2 x log-likelihood: -204742.76400
dataset$ID <- seq_along(dataset$Mean)
m.zero.olre <- glmer(
ZeroInflatedPoisson ~ factor(Mean) + (1 | ID),
data = dataset,
family = poisson
)
summary(m.zero.olre)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: ZeroInflatedPoisson ~ factor(Mean) + (1 | ID)
## Data: dataset
##
## AIC BIC logLik deviance df.resid
## 218344.4 218368.1 -109169.2 218338.4 19997
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.99401 -0.06651 -0.06651 0.00113 0.81490
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 8.377 2.894
## Number of obs: 20000, groups: ID, 20000
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.38382 0.06961 -77.34 <2e-16 ***
## factor(Mean)10000 13.64883 0.07490 182.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## fct(M)10000 -0.921
simulated$ZeroInflatedNB <- apply(
simulate(m.zero.nb, nsim = n.sim) == 0,
2,
mean
)
ggplot(simulated, aes(x = ZeroInflatedNB)) +
geom_histogram(binwidth = 0.0005) +
geom_vline(xintercept = mean(dataset$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
simulated$ZeroInflatedOLRE <- apply(
simulate(m.zero.olre, nsim = n.sim) == 0,
2,
mean
)
ggplot(simulated, aes(x = ZeroInflatedOLRE)) +
geom_histogram(binwidth = 0.0005) +
geom_vline(xintercept = mean(dataset$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
mean.2 <- 5
dataset2 <- expand.grid(
Mean = mean.2,
Rep = seq_len(100)
)
dataset2$ZeroInflatedPoisson <-
rbinom(nrow(dataset2), size = 1, prob = 1 - prop.zeroinfl) *
rpois(nrow(dataset2), lambda = dataset2$Mean)
ggplot(dataset2, aes(x = ZeroInflatedPoisson)) +
geom_histogram(binwidth = 1)
m.zero2 <- glm(ZeroInflatedPoisson ~ 1, data = dataset2, family = poisson)
summary(m.zero2)
##
## Call:
## glm(formula = ZeroInflatedPoisson ~ 1, family = poisson, data = dataset2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9326 -0.6633 -0.1464 0.7731 2.6952
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.45862 0.04822 30.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 217.63 on 99 degrees of freedom
## Residual deviance: 217.63 on 99 degrees of freedom
## AIC: 508.69
##
## Number of Fisher Scoring iterations: 5
simulated$ZeroInflated2 <- apply(
simulate(m.zero2, nsim = n.sim) == 0,
2,
mean
)
ggplot(simulated, aes(x = ZeroInflated2)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = mean(dataset2$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
m.zero.nb2 <- glm.nb(ZeroInflatedPoisson ~ 1, data = dataset2)
summary(m.zero.nb2)
##
## Call:
## glm.nb(formula = ZeroInflatedPoisson ~ 1, data = dataset2, init.theta = 4.249064467,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4375 -0.4811 -0.1038 0.5292 1.7364
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4586 0.0684 21.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(4.2491) family taken to be 1)
##
## Null deviance: 130.63 on 99 degrees of freedom
## Residual deviance: 130.63 on 99 degrees of freedom
## AIC: 489.84
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 4.25
## Std. Err.: 1.44
##
## 2 x log-likelihood: -485.839
simulated$ZeroInflatedNB2 <- apply(
simulate(m.zero.nb2, nsim = n.sim) == 0,
2,
mean
)
ggplot(simulated, aes(x = ZeroInflatedNB2)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = mean(dataset2$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
dataset2$ID <- seq_along(dataset2$Mean)
m.zero.olre2 <- glmer(
ZeroInflatedPoisson ~ (1 | ID),
data = dataset2,
family = poisson
)
simulated$ZeroInflatedOLRE2 <- apply(
simulate(m.zero.olre2, nsim = n.sim) == 0,
2,
mean
)
ggplot(simulated, aes(x = ZeroInflatedOLRE2)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = mean(dataset2$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
m.zero.zi2 <- zeroinfl(
ZeroInflatedPoisson ~ 1,
data = dataset2,
dist = "poisson"
)
summary(m.zero.zi2)
##
## Call:
## zeroinfl(formula = ZeroInflatedPoisson ~ 1, data = dataset2, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5786 -0.4772 -0.1101 0.6241 2.4596
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.61454 0.04905 32.92 <2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7794 0.2912 -6.111 9.9e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 5
## Log-likelihood: -225.2 on 2 Df
simulated$ZeroInflatedZI2 <- replicate(n.sim, sim.zero.zi(m.zero.zi2))
ggplot(simulated, aes(x = ZeroInflatedZI2)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = mean(dataset2$ZeroInflatedPoisson == 0), colour = "red") +
scale_x_continuous(label = percent)
A negative binomial distribution can capture some of the zero-inflation by overdispersion. Especially in cases with low mean and few data. When the mean is low, a reasonable portion of the zero origination comes from the poisson distribution. However, the shape of the distribution is different.
distribution <- data.frame(
Count = 0:20
)
distribution$Truth <- dpois(
distribution$Count,
lambda = mean.2
) + ifelse(
distribution$Count == 0,
prop.zeroinfl,
0
)
distribution$Poisson <- dpois(
distribution$Count,
lambda = exp(coef(m.zero2))
)
distribution$ZIPoisson <- dpois(
distribution$Count,
lambda = exp(coef(m.zero.zi2)[1])
) + ifelse(
distribution$Count == 0,
plogis(coef(m.zero.zi2)[2]),
0
)
se <- sqrt(VarCorr(m.zero.olre2)$ID)
z <- seq(qnorm(.001, sd = se), qnorm(.999, sd = se), length = 101)
dz <- dnorm(z, sd = se)
dz <- dz / sum(dz)
delta <- outer(
distribution$Count,
exp(z + fixef(m.zero.olre2)),
FUN = dpois
)
distribution$OLRE <- delta %*% dz
distribution$NegBin <- dnbinom(
distribution$Count,
size = m.zero.nb2$theta,
mu = exp(coef(m.zero.nb2))
)
long <- gather(distribution, key = "Model", value = "Density", -Count)
## Warning: attributes are not identical across measure variables;
## they will be dropped
long$Truth <- long$Model == "Truth"
ggplot(long, aes(x = Count, y = Density, colour = Model, linetype = Truth)) +
geom_line()