Imputation

library(mice) # for visualisation & pmm library(VIM) # visualisation & several hotdeck imputation methods #(fast implementation) library(simputation) # common interface for several imputation methods library(magrittr) # for pipe operator library(naniar) # additional package for practical library(visdat) # visualisation of datasets library(UpSetR) # Upset plots library(ggplot2) # plots

## age bmi hyp chl ## 1 1 NA NA NA ## 2 2 22.7 1 187 ## 3 1 NA 1 187 ## 4 3 NA NA NA ## 5 1 20.4 1 113 ## 6 3 NA NA 184 ## 7 1 22.5 1 118 ## 8 1 30.1 1 187 ## 9 2 22.0 1 238 ## 10 2 NA NA NA

VIM::aggr(nhanes, col=c('blue','red'), numbers = TRUE, sortVars = TRUE, labels = names(nhanes), cex.axis = .8, gap = 3, ylab = c("Histogram of missing data", "Red is Missing Data"))

## 'data.frame': 25 obs. of 4 variables: ## $ age: num 1 2 1 3 1 3 1 1 2 2 ... ## $ bmi: num NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ... ## $ hyp: num NA 1 1 NA 1 NA 1 1 1 NA ... ## $ chl: num NA 187 187 NA 113 184 118 187 238 NA ...

## 'data.frame': 25 obs. of 4 variables: ## $ age: Factor w/ 3 levels "20-39","40-59",..: 1 2 1 3 1 3 1 1 2 2 ... ## $ bmi: num NA 22.7 NA NA 20.4 NA 22.5 30.1 22 NA ... ## $ hyp: Factor w/ 2 levels "no","yes": NA 1 1 NA 1 NA 1 1 1 NA ... ## $ chl: num NA 187 187 NA 113 184 118 187 238 NA ...

## [1] mice.impute.2l.bin mice.impute.2l.lmer ## [3] mice.impute.2l.norm mice.impute.2l.pan ## [5] mice.impute.2lonly.mean mice.impute.2lonly.norm ## [7] mice.impute.2lonly.pmm mice.impute.cart ## [9] mice.impute.jomoImpute mice.impute.lda ## [11] mice.impute.logreg mice.impute.logreg.boot ## [13] mice.impute.mean mice.impute.midastouch ## [15] mice.impute.norm mice.impute.norm.boot ## [17] mice.impute.norm.nob mice.impute.norm.predict ## [19] mice.impute.panImpute mice.impute.passive ## [21] mice.impute.pmm mice.impute.polr ## [23] mice.impute.polyreg mice.impute.quadratic ## [25] mice.impute.rf mice.impute.ri ## [27] mice.impute.sample mice.mids ## [29] mice.theme ## see '?methods' for accessing help and source code

All imputed datasets in long format

complete(imp, "long")

##     .imp .id   age  bmi hyp chl
## 1      1   1 20-39 30.1  no 238
## 2      1   2 40-59 22.7  no 187
## 3      1   3 20-39 27.2  no 187
## 4      1   4 60-99 22.0  no 184
## 5      1   5 20-39 20.4  no 113
## 6      1   6 60-99 27.5 yes 184
## 7      1   7 20-39 22.5  no 118
## 8      1   8 20-39 30.1  no 187
## 9      1   9 40-59 22.0  no 238
## 10     1  10 40-59 22.5 yes 187
## 11     1  11 20-39 33.2  no 238
## 12     1  12 40-59 20.4  no 229
## 13     1  13 60-99 21.7  no 206
## 14     1  14 40-59 28.7 yes 204
## 15     1  15 20-39 29.6  no 187
## 16     1  16 20-39 27.2  no 118
## 17     1  17 60-99 27.2 yes 284
## 18     1  18 40-59 26.3 yes 199
## 19     1  19 20-39 35.3  no 218
## 20     1  20 60-99 25.5 yes 199
## 21     1  21 20-39 30.1  no 131
## 22     1  22 20-39 33.2  no 229
## 23     1  23 20-39 27.5  no 131
## 24     1  24 60-99 24.9  no 184
## 25     1  25 40-59 27.4  no 186
## 26     2   1 20-39 33.2 yes 229
## 27     2   2 40-59 22.7  no 187
## 28     2   3 20-39 22.0  no 187
## 29     2   4 60-99 30.1  no 199
## 30     2   5 20-39 20.4  no 113
## 31     2   6 60-99 26.3  no 184
## 32     2   7 20-39 22.5  no 118
## 33     2   8 20-39 30.1  no 187
## 34     2   9 40-59 22.0  no 238
## 35     2  10 40-59 35.3 yes 218
## 36     2  11 20-39 33.2 yes 187
## 37     2  12 40-59 20.4 yes 186
## 38     2  13 60-99 21.7  no 206
## 39     2  14 40-59 28.7 yes 204
## 40     2  15 20-39 29.6  no 187
## 41     2  16 20-39 22.5  no 238
## 42     2  17 60-99 27.2 yes 284
## 43     2  18 40-59 26.3 yes 199
## 44     2  19 20-39 35.3  no 218
## 45     2  20 60-99 25.5 yes 186
## 46     2  21 20-39 29.6  no 187
## 47     2  22 20-39 33.2  no 229
## 48     2  23 20-39 27.5  no 131
## 49     2  24 60-99 24.9  no 218
## 50     2  25 40-59 27.4  no 186
## 51     3   1 20-39 35.3 yes 199
## 52     3   2 40-59 22.7  no 187
## 53     3   3 20-39 28.7  no 187
## 54     3   4 60-99 28.7  no 184
## 55     3   5 20-39 20.4  no 113
## 56     3   6 60-99 27.4  no 184
## 57     3   7 20-39 22.5  no 118
## 58     3   8 20-39 30.1  no 187
## 59     3   9 40-59 22.0  no 238
## 60     3  10 40-59 27.4  no 187
## 61     3  11 20-39 30.1  no 131
## 62     3  12 40-59 22.5  no 131
## 63     3  13 60-99 21.7  no 206
## 64     3  14 40-59 28.7 yes 204
## 65     3  15 20-39 29.6  no 187
## 66     3  16 20-39 30.1 yes 199
## 67     3  17 60-99 27.2 yes 284
## 68     3  18 40-59 26.3 yes 199
## 69     3  19 20-39 35.3  no 218
## 70     3  20 60-99 25.5 yes 184
## 71     3  21 20-39 22.0  no 118
## 72     3  22 20-39 33.2  no 229
## 73     3  23 20-39 27.5  no 131
## 74     3  24 60-99 24.9  no 206
## 75     3  25 40-59 27.4  no 186
## 76     4   1 20-39 33.2  no 218
## 77     4   2 40-59 22.7  no 187
## 78     4   3 20-39 29.6  no 187
## 79     4   4 60-99 20.4  no 187
## 80     4   5 20-39 20.4  no 113
## 81     4   6 60-99 22.7  no 184
## 82     4   7 20-39 22.5  no 118
## 83     4   8 20-39 30.1  no 187
## 84     4   9 40-59 22.0  no 238
## 85     4  10 40-59 28.7  no 184
## 86     4  11 20-39 28.7  no 187
## 87     4  12 40-59 26.3  no 184
## 88     4  13 60-99 21.7  no 206
## 89     4  14 40-59 28.7 yes 204
## 90     4  15 20-39 29.6  no 187
## 91     4  16 20-39 30.1  no 199
## 92     4  17 60-99 27.2 yes 284
## 93     4  18 40-59 26.3 yes 199
## 94     4  19 20-39 35.3  no 218
## 95     4  20 60-99 25.5 yes 186
## 96     4  21 20-39 33.2  no 199
## 97     4  22 20-39 33.2  no 229
## 98     4  23 20-39 27.5  no 131
## 99     4  24 60-99 24.9  no 199
## 100    4  25 40-59 27.4  no 186

## call : ## with.mids(data = imp, expr = lm(bmi ~ chl)) ## ## call1 : ## mice(data = nhanes2, m = 4, printFlag = F, seed = 101, init = pred) ## ## nmis : ## age bmi hyp chl ## 0 9 8 10 ## ## analyses : ## [[1]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 22.15053 0.02338 ## ## ## [[2]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 19.9017 0.0367 ## ## ## [[3]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 20.2657 0.0371 ## ## ## [[4]] ## ## Call: ## lm(formula = bmi ~ chl) ## ## Coefficients: ## (Intercept) chl ## 19.04484 0.04203

## ## Call: ## lm(formula = bmi ~ chl) ## ## Residuals: ## Min 1Q Median 3Q Max ## -6.6367 -3.6490 -0.3548 2.8947 7.3974 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 19.90171 4.98265 3.994 0.000571 *** ## chl 0.03670 0.02496 1.470 0.155057 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 4.549 on 23 degrees of freedom ## Multiple R-squared: 0.0859, Adjusted R-squared: 0.04616 ## F-statistic: 2.161 on 1 and 23 DF, p-value: 0.1551

## estimate std.error statistic df p.value ## (Intercept) 20.34068248 4.54833393 4.472117 17.81040 0.0003016604 ## chl 0.03480344 0.02379386 1.462707 16.27191 0.1625944773

Imputation

This Lecture:

I use the following packages in this lecture

Missing data patterns

`nhanes` data

Visualise missing data pattern

Imputation methods

Specifying models in `simputation`

Regression imputation

Median imputation

Other methods

Multivariate Imputation by Chained Equations (mice)

Check imputations

Vary number of imputations

Predictor matrix

Initial predictor matrix

Change predictor matrix

Changing imputation methods

Available methods

Extracting imputed data

Pooling analyses

Other packages for imputation

Practical

Extra slides

Visualization

Visualization

Convergence

Seed values

Imputation

This Lecture:

I use the following packages in this lecture

Missing data patterns

nhanes data

Visualise missing data pattern

Imputation methods

Specifying models in simputation

Regression imputation

Median imputation

Other methods

Multivariate Imputation by Chained Equations (mice)

Check imputations

Vary number of imputations

Predictor matrix

Initial predictor matrix

Change predictor matrix

Changing imputation methods

Available methods

Extracting imputed data

Pooling analyses

Other packages for imputation

Practical

Extra slides

Visualization

Visualization

Convergence

Seed values

`nhanes` data

Specifying models in `simputation`