Statistical Programming with R
simputation packagemice packagelibrary(mice) # for visualisation & pmm library(VIM) # visualisation & several hotdeck imputation methods #(fast implementation) library(simputation) # common interface for several imputation methods library(magrittr) # for pipe operator library(naniar) # additional package for practical library(visdat) # visualisation of datasets library(UpSetR) # Upset plots library(ggplot2) # plots
nhanes dataage: Age group (1=20-39, 2=40-59, 3=60+)
bmi: Body mass index (kg/m**2)
hyp: High blood pressure, hypertension (1=no,2=yes)
chl: Total serum cholesterol (mg/dL)
head(nhanes, n = 10)
## age bmi hyp chl ## 1 1 NA NA NA ## 2 2 22.7 1 187 ## 3 1 NA 1 187 ## 4 3 NA NA NA ## 5 1 20.4 1 113 ## 6 3 NA NA 184 ## 7 1 22.5 1 118 ## 8 1 30.1 1 187 ## 9 2 22.0 1 238 ## 10 2 NA NA NA
VIM::aggr(nhanes, col=c('blue','red'), numbers = TRUE, sortVars = TRUE,
labels = names(nhanes), cex.axis = .8, gap = 3,
ylab = c("Histogram of missing data", "Red is Missing Data"))
## ## Variables sorted by number of missings: ## Variable Count ## chl 0.40 ## bmi 0.36 ## hyp 0.32 ## age 0.00
pattern <- mice::md.pattern(nhanes)
pattern
## age hyp bmi chl ## 13 1 1 1 1 0 ## 3 1 1 1 0 1 ## 1 1 1 0 1 1 ## 1 1 0 0 1 2 ## 7 1 0 0 0 3 ## 0 8 9 10 27
naniar::gg_miss_upset(nhanes)
simputationCommon structure for specifying an imputation model:
impute_<model>(data, formula, [model-specific options])
Structure of the model in formula:
IMPUTED ~ MODEL_SPECIFICATION [ | GROUPING ]
imp_1 <- impute_lm(nhanes, bmi ~ age + hyp + chl) head(imp_1)
## age bmi hyp chl ## 1 1 NA NA NA ## 2 2 22.70000 1 187 ## 3 1 29.63182 1 187 ## 4 3 NA NA NA ## 5 1 20.40000 1 113 ## 6 3 NA NA 184
head(nhanes)
## age bmi hyp chl ## 1 1 NA NA NA ## 2 2 22.7 1 187 ## 3 1 NA 1 187 ## 4 3 NA NA NA ## 5 1 20.4 1 113 ## 6 3 NA NA 184
imp_2 <- simputation::impute_median(imp_1, bmi ~ age) head(imp_2)
## age bmi hyp chl ## 1 1 29.61591 NA NA ## 2 2 22.70000 1 187 ## 3 1 29.63182 1 187 ## 4 3 25.20000 NA NA ## 5 1 20.40000 1 113 ## 6 3 25.20000 NA 184
imp_2a <- nhanes %>%
simputation::impute_lm(bmi ~ age + hyp + chl) %>%
simputation::impute_median(bmi ~ age)
head(imp_2a)
## age bmi hyp chl ## 1 1 29.61591 NA NA ## 2 2 22.70000 1 187 ## 3 1 29.63182 1 187 ## 4 3 25.20000 NA NA ## 5 1 20.40000 1 113 ## 6 3 25.20000 NA 184
impute_cart(): decision tree imputationimpute_hotdeck(): random and sequential hot deck, k-nearest neighbours imputation and predictive mean matching.impute_multivariate(): imputation based on EM-estimation of multivariate normal parameters, imputation based on iterative Random Forest estimates and stochastic imptuation based on bootstrapped EM-estimatin of multivariate normal parameters.impute_proxy(): Impute missing values by a constant, by copying another variable, computing transformations from other variables.Default imputation
imp <- mice(nhanes, print = FALSE)
Defaults:
stripplot(imp, chl, pch = 19, xlab = "Imputation number")
Just 4 imputed datasets instead of the default (5)
imp <- mice(nhanes, m = 4, print = F)
imp$pred
## age bmi hyp chl ## age 0 1 1 1 ## bmi 1 0 1 1 ## hyp 1 1 0 1 ## chl 1 1 1 0
1 = column used to impute row
ini <- mice(nhanes, maxit = 0, print = F) pred <- ini$pred pred
## age bmi hyp chl ## age 0 1 1 1 ## bmi 1 0 1 1 ## hyp 1 1 0 1 ## chl 1 1 1 0
We do not use hyp to impute any other variables
pred[ ,"hyp"] <- 0 pred
## age bmi hyp chl ## age 0 1 0 1 ## bmi 1 0 0 1 ## hyp 1 1 0 1 ## chl 1 1 0 0
imp <- mice(nhanes, m = 4, init = pred, print = F)
imp$meth
## age bmi hyp chl ## "" "pmm" "pmm" "pmm"
str(nhanes)
## 'data.frame': 25 obs. of 4 variables: ## $ age: num 1 2 1 3 1 3 1 1 2 2 ... ## $ bmi: num NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ... ## $ hyp: num NA 1 1 NA 1 NA 1 1 1 NA ... ## $ chl: num NA 187 187 NA 113 184 118 187 238 NA ...
str(nhanes2)
## 'data.frame': 25 obs. of 4 variables: ## $ age: Factor w/ 3 levels "20-39","40-59",..: 1 2 1 3 1 3 1 1 2 2 ... ## $ bmi: num NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ... ## $ hyp: Factor w/ 2 levels "no","yes": NA 1 1 NA 1 NA 1 1 1 NA ... ## $ chl: num NA 187 187 NA 113 184 118 187 238 NA ...
imp <- mice(nhanes2, m = 4, init = pred, print = F, seed = 101) imp$meth
## age bmi hyp chl ## "" "pmm" "logreg" "pmm"
methods(mice)
## [1] mice.impute.2l.bin mice.impute.2l.lmer ## [3] mice.impute.2l.norm mice.impute.2l.pan ## [5] mice.impute.2lonly.mean mice.impute.2lonly.norm ## [7] mice.impute.2lonly.pmm mice.impute.cart ## [9] mice.impute.jomoImpute mice.impute.lda ## [11] mice.impute.logreg mice.impute.logreg.boot ## [13] mice.impute.mean mice.impute.midastouch ## [15] mice.impute.norm mice.impute.norm.boot ## [17] mice.impute.norm.nob mice.impute.norm.predict ## [19] mice.impute.panImpute mice.impute.passive ## [21] mice.impute.pmm mice.impute.polr ## [23] mice.impute.polyreg mice.impute.quadratic ## [25] mice.impute.rf mice.impute.ri ## [27] mice.impute.sample mice.mids ## [29] mice.theme ## see '?methods' for accessing help and source code
The 3rd imputed dataset
c3 <- complete(imp, 3) md.pattern(c3, plot = FALSE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## age bmi hyp chl ## 25 1 1 1 1 0 ## 0 0 0 0 0
All imputed datasets in long format
complete(imp, "long")
## .imp .id age bmi hyp chl ## 1 1 1 20-39 30.1 no 238 ## 2 1 2 40-59 22.7 no 187 ## 3 1 3 20-39 27.2 no 187 ## 4 1 4 60-99 22.0 no 184 ## 5 1 5 20-39 20.4 no 113 ## 6 1 6 60-99 27.5 yes 184 ## 7 1 7 20-39 22.5 no 118 ## 8 1 8 20-39 30.1 no 187 ## 9 1 9 40-59 22.0 no 238 ## 10 1 10 40-59 22.5 yes 187 ## 11 1 11 20-39 33.2 no 238 ## 12 1 12 40-59 20.4 no 229 ## 13 1 13 60-99 21.7 no 206 ## 14 1 14 40-59 28.7 yes 204 ## 15 1 15 20-39 29.6 no 187 ## 16 1 16 20-39 27.2 no 118 ## 17 1 17 60-99 27.2 yes 284 ## 18 1 18 40-59 26.3 yes 199 ## 19 1 19 20-39 35.3 no 218 ## 20 1 20 60-99 25.5 yes 199 ## 21 1 21 20-39 30.1 no 131 ## 22 1 22 20-39 33.2 no 229 ## 23 1 23 20-39 27.5 no 131 ## 24 1 24 60-99 24.9 no 184 ## 25 1 25 40-59 27.4 no 186 ## 26 2 1 20-39 33.2 yes 229 ## 27 2 2 40-59 22.7 no 187 ## 28 2 3 20-39 22.0 no 187 ## 29 2 4 60-99 30.1 no 199 ## 30 2 5 20-39 20.4 no 113 ## 31 2 6 60-99 26.3 no 184 ## 32 2 7 20-39 22.5 no 118 ## 33 2 8 20-39 30.1 no 187 ## 34 2 9 40-59 22.0 no 238 ## 35 2 10 40-59 35.3 yes 218 ## 36 2 11 20-39 33.2 yes 187 ## 37 2 12 40-59 20.4 yes 186 ## 38 2 13 60-99 21.7 no 206 ## 39 2 14 40-59 28.7 yes 204 ## 40 2 15 20-39 29.6 no 187 ## 41 2 16 20-39 22.5 no 238 ## 42 2 17 60-99 27.2 yes 284 ## 43 2 18 40-59 26.3 yes 199 ## 44 2 19 20-39 35.3 no 218 ## 45 2 20 60-99 25.5 yes 186 ## 46 2 21 20-39 29.6 no 187 ## 47 2 22 20-39 33.2 no 229 ## 48 2 23 20-39 27.5 no 131 ## 49 2 24 60-99 24.9 no 218 ## 50 2 25 40-59 27.4 no 186 ## 51 3 1 20-39 35.3 yes 199 ## 52 3 2 40-59 22.7 no 187 ## 53 3 3 20-39 28.7 no 187 ## 54 3 4 60-99 28.7 no 184 ## 55 3 5 20-39 20.4 no 113 ## 56 3 6 60-99 27.4 no 184 ## 57 3 7 20-39 22.5 no 118 ## 58 3 8 20-39 30.1 no 187 ## 59 3 9 40-59 22.0 no 238 ## 60 3 10 40-59 27.4 no 187 ## 61 3 11 20-39 30.1 no 131 ## 62 3 12 40-59 22.5 no 131 ## 63 3 13 60-99 21.7 no 206 ## 64 3 14 40-59 28.7 yes 204 ## 65 3 15 20-39 29.6 no 187 ## 66 3 16 20-39 30.1 yes 199 ## 67 3 17 60-99 27.2 yes 284 ## 68 3 18 40-59 26.3 yes 199 ## 69 3 19 20-39 35.3 no 218 ## 70 3 20 60-99 25.5 yes 184 ## 71 3 21 20-39 22.0 no 118 ## 72 3 22 20-39 33.2 no 229 ## 73 3 23 20-39 27.5 no 131 ## 74 3 24 60-99 24.9 no 206 ## 75 3 25 40-59 27.4 no 186 ## 76 4 1 20-39 33.2 no 218 ## 77 4 2 40-59 22.7 no 187 ## 78 4 3 20-39 29.6 no 187 ## 79 4 4 60-99 20.4 no 187 ## 80 4 5 20-39 20.4 no 113 ## 81 4 6 60-99 22.7 no 184 ## 82 4 7 20-39 22.5 no 118 ## 83 4 8 20-39 30.1 no 187 ## 84 4 9 40-59 22.0 no 238 ## 85 4 10 40-59 28.7 no 184 ## 86 4 11 20-39 28.7 no 187 ## 87 4 12 40-59 26.3 no 184 ## 88 4 13 60-99 21.7 no 206 ## 89 4 14 40-59 28.7 yes 204 ## 90 4 15 20-39 29.6 no 187 ## 91 4 16 20-39 30.1 no 199 ## 92 4 17 60-99 27.2 yes 284 ## 93 4 18 40-59 26.3 yes 199 ## 94 4 19 20-39 35.3 no 218 ## 95 4 20 60-99 25.5 yes 186 ## 96 4 21 20-39 33.2 no 199 ## 97 4 22 20-39 33.2 no 229 ## 98 4 23 20-39 27.5 no 131 ## 99 4 24 60-99 24.9 no 199 ## 100 4 25 40-59 27.4 no 186
fit <- with(imp, lm(bmi ~ chl)) fit
## call : ## with.mids(data = imp, expr = lm(bmi ~ chl)) ## ## call1 : ## mice(data = nhanes2, m = 4, printFlag = F, seed = 101, init = pred) ## ## nmis : ## age bmi hyp chl ## 0 9 8 10 ## ## analyses : ## [[1]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 22.15053 0.02338 ## ## ## [[2]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 19.9017 0.0367 ## ## ## [[3]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 20.2657 0.0371 ## ## ## [[4]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 19.04484 0.04203
summary(fit$analyses[[2]])
## ## Call: ## lm(formula = bmi ~ chl) ## ## Residuals: ## Min 1Q Median 3Q Max ## -6.6367 -3.6490 -0.3548 2.8947 7.3974 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 19.90171 4.98265 3.994 0.000571 *** ## chl 0.03670 0.02496 1.470 0.155057 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 4.549 on 23 degrees of freedom ## Multiple R-squared: 0.0859, Adjusted R-squared: 0.04616 ## F-statistic: 2.161 on 1 and 23 DF, p-value: 0.1551
pool.fit <- pool(fit) summary(pool.fit)
## estimate std.error statistic df p.value ## (Intercept) 20.34068248 4.54833393 4.472117 17.81040 0.0003016604 ## chl 0.03480344 0.02379386 1.462707 16.27191 0.1625944773
miHmiscAmeliamixnaniar::gg_miss_fct(x = nhanes, fct = age)
visdat::vis_miss(nhanes)
plot(imp)
imp <- mice(nhanes, m = 4, init = pred, print = F, seed = 101)