simputation packagemice packagelibrary(mice) # for visualisation & pmm
library(VIM) # visualisation & several hotdeck imputation methods
#(fast implementation)
library(simputation) # common interface for several imputation methods
library(magrittr) # for pipe operator
library(naniar) # additional package for practical
library(visdat) # visualisation of datasets
library(UpSetR) # Upset plots
library(ggplot2) # plots
nhanes dataage: Age group (1=20-39, 2=40-59, 3=60+)
bmi: Body mass index (kg/m**2)
hyp: High blood pressure, hypertension (1=no,2=yes)
chl: Total serum cholesterol (mg/dL)
head(nhanes, n = 10)
## age bmi hyp chl
## 1 1 NA NA NA
## 2 2 22.7 1 187
## 3 1 NA 1 187
## 4 3 NA NA NA
## 5 1 20.4 1 113
## 6 3 NA NA 184
## 7 1 22.5 1 118
## 8 1 30.1 1 187
## 9 2 22.0 1 238
## 10 2 NA NA NA
VIM::aggr(nhanes, col=c('blue','red'), numbers = TRUE, sortVars = TRUE,
labels = names(nhanes), cex.axis = .8, gap = 3,
ylab = c("Histogram of missing data", "Red is Missing Data"))
##
## Variables sorted by number of missings:
## Variable Count
## chl 0.40
## bmi 0.36
## hyp 0.32
## age 0.00
pattern <- mice::md.pattern(nhanes)
pattern
## age hyp bmi chl
## 13 1 1 1 1 0
## 3 1 1 1 0 1
## 1 1 1 0 1 1
## 1 1 0 0 1 2
## 7 1 0 0 0 3
## 0 8 9 10 27
naniar::gg_miss_upset(nhanes)
simputationCommon structure for specifying an imputation model:
impute_<model>(data, formula, [model-specific options])
Structure of the model in formula:
IMPUTED ~ MODEL_SPECIFICATION [ | GROUPING ]
imp_1 <- impute_lm(nhanes, bmi ~ age + hyp + chl)
head(imp_1)
## age bmi hyp chl
## 1 1 NA NA NA
## 2 2 22.70000 1 187
## 3 1 29.63182 1 187
## 4 3 NA NA NA
## 5 1 20.40000 1 113
## 6 3 NA NA 184
head(nhanes)
## age bmi hyp chl
## 1 1 NA NA NA
## 2 2 22.7 1 187
## 3 1 NA 1 187
## 4 3 NA NA NA
## 5 1 20.4 1 113
## 6 3 NA NA 184
imp_2 <- simputation::impute_median(imp_1, bmi ~ age)
head(imp_2)
## age bmi hyp chl
## 1 1 29.61591 NA NA
## 2 2 22.70000 1 187
## 3 1 29.63182 1 187
## 4 3 25.20000 NA NA
## 5 1 20.40000 1 113
## 6 3 25.20000 NA 184
imp_2a <- nhanes %>%
simputation::impute_lm(bmi ~ age + hyp + chl) %>%
simputation::impute_median(bmi ~ age)
head(imp_2a)
## age bmi hyp chl
## 1 1 29.61591 NA NA
## 2 2 22.70000 1 187
## 3 1 29.63182 1 187
## 4 3 25.20000 NA NA
## 5 1 20.40000 1 113
## 6 3 25.20000 NA 184
impute_cart(): decision tree imputationimpute_hotdeck(): random and sequential hot deck, k-nearest neighbours imputation and predictive mean matching.impute_multivariate(): imputation based on EM-estimation of multivariate normal parameters, imputation based on iterative Random Forest estimates and stochastic imptuation based on bootstrapped EM-estimatin of multivariate normal parameters.impute_proxy(): Impute missing values by a constant, by copying another variable, computing transformations from other variables.Default imputation
imp <- mice(nhanes, print = FALSE)
Defaults:
stripplot(imp, chl, pch = 19, xlab = "Imputation number")
Just 4 imputed datasets instead of the default (5)
imp <- mice(nhanes, m = 4, print = F)
imp$pred
## age bmi hyp chl
## age 0 1 1 1
## bmi 1 0 1 1
## hyp 1 1 0 1
## chl 1 1 1 0
1 = column used to impute row
ini <- mice(nhanes, maxit = 0, print = F)
pred <- ini$pred
pred
## age bmi hyp chl
## age 0 1 1 1
## bmi 1 0 1 1
## hyp 1 1 0 1
## chl 1 1 1 0
We do not use hyp to impute any other variables
pred[ ,"hyp"] <- 0
pred
## age bmi hyp chl
## age 0 1 0 1
## bmi 1 0 0 1
## hyp 1 1 0 1
## chl 1 1 0 0
imp <- mice(nhanes, m = 4, init = pred, print = F)
imp$meth
## age bmi hyp chl
## "" "pmm" "pmm" "pmm"
str(nhanes)
## 'data.frame': 25 obs. of 4 variables:
## $ age: num 1 2 1 3 1 3 1 1 2 2 ...
## $ bmi: num NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ...
## $ hyp: num NA 1 1 NA 1 NA 1 1 1 NA ...
## $ chl: num NA 187 187 NA 113 184 118 187 238 NA ...
str(nhanes2)
## 'data.frame': 25 obs. of 4 variables:
## $ age: Factor w/ 3 levels "20-39","40-59",..: 1 2 1 3 1 3 1 1 2 2 ...
## $ bmi: num NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ...
## $ hyp: Factor w/ 2 levels "no","yes": NA 1 1 NA 1 NA 1 1 1 NA ...
## $ chl: num NA 187 187 NA 113 184 118 187 238 NA ...
imp <- mice(nhanes2, m = 4, init = pred, print = F, seed = 101)
imp$meth
## age bmi hyp chl
## "" "pmm" "logreg" "pmm"
methods(mice)
## [1] mice.impute.2l.bin mice.impute.2l.lmer
## [3] mice.impute.2l.norm mice.impute.2l.pan
## [5] mice.impute.2lonly.mean mice.impute.2lonly.norm
## [7] mice.impute.2lonly.pmm mice.impute.cart
## [9] mice.impute.jomoImpute mice.impute.lda
## [11] mice.impute.logreg mice.impute.logreg.boot
## [13] mice.impute.mean mice.impute.midastouch
## [15] mice.impute.norm mice.impute.norm.boot
## [17] mice.impute.norm.nob mice.impute.norm.predict
## [19] mice.impute.panImpute mice.impute.passive
## [21] mice.impute.pmm mice.impute.polr
## [23] mice.impute.polyreg mice.impute.quadratic
## [25] mice.impute.rf mice.impute.ri
## [27] mice.impute.sample mice.mids
## [29] mice.theme
## see '?methods' for accessing help and source code
The 3rd imputed dataset
c3 <- complete(imp, 3)
md.pattern(c3, plot = FALSE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## age bmi hyp chl
## 25 1 1 1 1 0
## 0 0 0 0 0
All imputed datasets in long format
complete(imp, "long")
## .imp .id age bmi hyp chl
## 1 1 1 20-39 30.1 no 238
## 2 1 2 40-59 22.7 no 187
## 3 1 3 20-39 27.2 no 187
## 4 1 4 60-99 22.0 no 184
## 5 1 5 20-39 20.4 no 113
## 6 1 6 60-99 27.5 yes 184
## 7 1 7 20-39 22.5 no 118
## 8 1 8 20-39 30.1 no 187
## 9 1 9 40-59 22.0 no 238
## 10 1 10 40-59 22.5 yes 187
## 11 1 11 20-39 33.2 no 238
## 12 1 12 40-59 20.4 no 229
## 13 1 13 60-99 21.7 no 206
## 14 1 14 40-59 28.7 yes 204
## 15 1 15 20-39 29.6 no 187
## 16 1 16 20-39 27.2 no 118
## 17 1 17 60-99 27.2 yes 284
## 18 1 18 40-59 26.3 yes 199
## 19 1 19 20-39 35.3 no 218
## 20 1 20 60-99 25.5 yes 199
## 21 1 21 20-39 30.1 no 131
## 22 1 22 20-39 33.2 no 229
## 23 1 23 20-39 27.5 no 131
## 24 1 24 60-99 24.9 no 184
## 25 1 25 40-59 27.4 no 186
## 26 2 1 20-39 33.2 yes 229
## 27 2 2 40-59 22.7 no 187
## 28 2 3 20-39 22.0 no 187
## 29 2 4 60-99 30.1 no 199
## 30 2 5 20-39 20.4 no 113
## 31 2 6 60-99 26.3 no 184
## 32 2 7 20-39 22.5 no 118
## 33 2 8 20-39 30.1 no 187
## 34 2 9 40-59 22.0 no 238
## 35 2 10 40-59 35.3 yes 218
## 36 2 11 20-39 33.2 yes 187
## 37 2 12 40-59 20.4 yes 186
## 38 2 13 60-99 21.7 no 206
## 39 2 14 40-59 28.7 yes 204
## 40 2 15 20-39 29.6 no 187
## 41 2 16 20-39 22.5 no 238
## 42 2 17 60-99 27.2 yes 284
## 43 2 18 40-59 26.3 yes 199
## 44 2 19 20-39 35.3 no 218
## 45 2 20 60-99 25.5 yes 186
## 46 2 21 20-39 29.6 no 187
## 47 2 22 20-39 33.2 no 229
## 48 2 23 20-39 27.5 no 131
## 49 2 24 60-99 24.9 no 218
## 50 2 25 40-59 27.4 no 186
## 51 3 1 20-39 35.3 yes 199
## 52 3 2 40-59 22.7 no 187
## 53 3 3 20-39 28.7 no 187
## 54 3 4 60-99 28.7 no 184
## 55 3 5 20-39 20.4 no 113
## 56 3 6 60-99 27.4 no 184
## 57 3 7 20-39 22.5 no 118
## 58 3 8 20-39 30.1 no 187
## 59 3 9 40-59 22.0 no 238
## 60 3 10 40-59 27.4 no 187
## 61 3 11 20-39 30.1 no 131
## 62 3 12 40-59 22.5 no 131
## 63 3 13 60-99 21.7 no 206
## 64 3 14 40-59 28.7 yes 204
## 65 3 15 20-39 29.6 no 187
## 66 3 16 20-39 30.1 yes 199
## 67 3 17 60-99 27.2 yes 284
## 68 3 18 40-59 26.3 yes 199
## 69 3 19 20-39 35.3 no 218
## 70 3 20 60-99 25.5 yes 184
## 71 3 21 20-39 22.0 no 118
## 72 3 22 20-39 33.2 no 229
## 73 3 23 20-39 27.5 no 131
## 74 3 24 60-99 24.9 no 206
## 75 3 25 40-59 27.4 no 186
## 76 4 1 20-39 33.2 no 218
## 77 4 2 40-59 22.7 no 187
## 78 4 3 20-39 29.6 no 187
## 79 4 4 60-99 20.4 no 187
## 80 4 5 20-39 20.4 no 113
## 81 4 6 60-99 22.7 no 184
## 82 4 7 20-39 22.5 no 118
## 83 4 8 20-39 30.1 no 187
## 84 4 9 40-59 22.0 no 238
## 85 4 10 40-59 28.7 no 184
## 86 4 11 20-39 28.7 no 187
## 87 4 12 40-59 26.3 no 184
## 88 4 13 60-99 21.7 no 206
## 89 4 14 40-59 28.7 yes 204
## 90 4 15 20-39 29.6 no 187
## 91 4 16 20-39 30.1 no 199
## 92 4 17 60-99 27.2 yes 284
## 93 4 18 40-59 26.3 yes 199
## 94 4 19 20-39 35.3 no 218
## 95 4 20 60-99 25.5 yes 186
## 96 4 21 20-39 33.2 no 199
## 97 4 22 20-39 33.2 no 229
## 98 4 23 20-39 27.5 no 131
## 99 4 24 60-99 24.9 no 199
## 100 4 25 40-59 27.4 no 186
fit <- with(imp, lm(bmi ~ chl))
fit
## call :
## with.mids(data = imp, expr = lm(bmi ~ chl))
##
## call1 :
## mice(data = nhanes2, m = 4, printFlag = F, seed = 101, init = pred)
##
## nmis :
## age bmi hyp chl
## 0 9 8 10
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = bmi ~ chl)
##
## Coefficients:
## (Intercept) chl
## 22.15053 0.02338
##
##
## [[2]]
##
## Call:
## lm(formula = bmi ~ chl)
##
## Coefficients:
## (Intercept) chl
## 19.9017 0.0367
##
##
## [[3]]
##
## Call:
## lm(formula = bmi ~ chl)
##
## Coefficients:
## (Intercept) chl
## 20.2657 0.0371
##
##
## [[4]]
##
## Call:
## lm(formula = bmi ~ chl)
##
## Coefficients:
## (Intercept) chl
## 19.04484 0.04203
summary(fit$analyses[[2]])
##
## Call:
## lm(formula = bmi ~ chl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6367 -3.6490 -0.3548 2.8947 7.3974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.90171 4.98265 3.994 0.000571 ***
## chl 0.03670 0.02496 1.470 0.155057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.549 on 23 degrees of freedom
## Multiple R-squared: 0.0859, Adjusted R-squared: 0.04616
## F-statistic: 2.161 on 1 and 23 DF, p-value: 0.1551
pool.fit <- pool(fit)
summary(pool.fit)
## estimate std.error statistic df p.value
## (Intercept) 20.34068248 4.54833393 4.472117 17.81040 0.0003016604
## chl 0.03480344 0.02379386 1.462707 16.27191 0.1625944773
miHmiscAmeliamixnaniar::gg_miss_fct(x = nhanes, fct = age)
visdat::vis_miss(nhanes)
plot(imp)
imp <- mice(nhanes, m = 4, init = pred, print = F, seed = 101)