`impute_missForest()` imputes missing values in a dataset using the missForest method.
It allows the user to exclude certain columns from imputation and can display the
percentage of missing values in each column before imputation. The user can also specify
the maximum number of iterations, the number of trees to grow, the type of parallelization
("no", "variables", or "forests"), and the number of cores to use for parallelization.
Usage
impute_missForest(
olink_data,
wide = TRUE,
maxiter = 10,
ntree = 100,
parallelize = "variables",
ncores = 4,
exclude_cols = c("DAid", "Disease"),
show_na_percentage = TRUE
)
Arguments
- olink_data
The input dataset.
- wide
If TRUE, the input data is expected to be in wide format (as in the example below, with one column per assay).
- maxiter
The maximum number of iterations.
- ntree
The number of trees to grow.
- parallelize
The type of parallelization to use. Options are "no", "variables", or "forests".
- ncores
The number of cores to use for parallelization.
- exclude_cols
The names of the columns to exclude from imputation.
- show_na_percentage
If TRUE, the percentage of missing values in each column is displayed before imputation.
Examples
# Data before imputation
test_data <- example_data |>
dplyr::select(DAid, Assay, NPX) |>
tidyr::pivot_wider(names_from = "Assay", values_from = "NPX") |>
dplyr::slice_head(n = 100)
test_data
#> # A tibble: 100 × 101
#> DAid AARSD1 ABL1 ACAA1 ACAN ACE2 ACOX1 ACP5 ACP6 ACTA2
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 DA00001 3.39 2.76 1.71 0.0333 1.76 -0.919 1.54 2.15 2.81
#> 2 DA00002 1.42 1.25 -0.816 -0.459 0.826 -0.902 0.647 1.30 0.798
#> 3 DA00003 NA NA NA 0.989 NA 0.330 1.37 NA NA
#> 4 DA00004 3.41 3.38 1.69 NA 1.52 NA 0.841 0.582 1.70
#> 5 DA00005 5.01 5.05 0.128 0.401 -0.933 -0.584 0.0265 1.16 2.73
#> 6 DA00006 6.83 1.18 -1.74 -0.156 1.53 -0.721 0.620 0.527 0.772
#> 7 DA00007 NA NA 3.96 0.682 3.14 2.62 1.47 2.25 2.01
#> 8 DA00008 2.78 0.812 -0.552 0.982 -0.101 -0.304 0.376 -0.826 1.52
#> 9 DA00009 4.39 3.34 -0.452 -0.868 0.395 1.71 1.49 -0.0285 0.200
#> 10 DA00010 1.83 1.21 -0.912 -1.04 -0.0918 -0.304 1.69 0.0920 2.04
#> # ℹ 90 more rows
#> # ℹ 91 more variables: ACTN4 <dbl>, ACY1 <dbl>, ADA <dbl>, ADA2 <dbl>,
#> # ADAM15 <dbl>, ADAM23 <dbl>, ADAM8 <dbl>, ADAMTS13 <dbl>, ADAMTS15 <dbl>,
#> # ADAMTS16 <dbl>, ADAMTS8 <dbl>, ADCYAP1R1 <dbl>, ADGRE2 <dbl>, ADGRE5 <dbl>,
#> # ADGRG1 <dbl>, ADGRG2 <dbl>, ADH4 <dbl>, ADM <dbl>, AGER <dbl>, AGR2 <dbl>,
#> # AGR3 <dbl>, AGRN <dbl>, AGRP <dbl>, AGXT <dbl>, AHCY <dbl>, AHSP <dbl>,
#> # AIF1 <dbl>, AIFM1 <dbl>, AK1 <dbl>, AKR1B1 <dbl>, AKR1C4 <dbl>, …
# Data after imputation
impute_missForest(test_data, maxiter = 1, ntree = 50, parallelize = "no")
#> # A tibble: 88 × 2
#> column na_percentage
#> <chr> <dbl>
#> 1 ADA2 7
#> 2 ANG 7
#> 3 ANGPTL3 7
#> 4 ANPEP 7
#> 5 AOC3 7
#> 6 APOM 7
#> 7 ART3 7
#> 8 AXL 7
#> 9 ADAMTS8 6
#> 10 AHSP 6
#> # ℹ 78 more rows
#> missForest iteration 1 in progress...done!
#> estimated error(s): 0.6277277
#> difference(s): 0.00303483
#> time: 2.195 seconds
#>
#> # A tibble: 100 × 101
#> DAid AARSD1 ABL1 ACAA1 ACAN ACE2 ACOX1 ACP5 ACP6 ACTA2 ACTN4
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 DA00… 3.39 2.76 1.71 0.0333 1.76 -0.919 1.54 2.15 2.81 0.742
#> 2 DA00… 1.42 1.25 -0.816 -0.459 0.826 -0.902 0.647 1.30 0.798 -0.0659
#> 3 DA00… 2.95 2.45 1.56 0.989 0.567 0.330 1.37 1.24 1.78 0.292
#> 4 DA00… 3.41 3.38 1.69 0.402 1.52 1.15 0.841 0.582 1.70 0.108
#> 5 DA00… 5.01 5.05 0.128 0.401 -0.933 -0.584 0.0265 1.16 2.73 0.350
#> 6 DA00… 6.83 1.18 -1.74 -0.156 1.53 -0.721 0.620 0.527 0.772 0.297
#> 7 DA00… 4.19 4.11 3.96 0.682 3.14 2.62 1.47 2.25 2.01 0.170
#> 8 DA00… 2.78 0.812 -0.552 0.982 -0.101 -0.304 0.376 -0.826 1.52 -0.597
#> 9 DA00… 4.39 3.34 -0.452 -0.868 0.395 1.71 1.49 -0.0285 0.200 -0.532
#> 10 DA00… 1.83 1.21 -0.912 -1.04 -0.0918 -0.304 1.69 0.0920 2.04 0.501
#> # ℹ 90 more rows
#> # ℹ 90 more variables: ACY1 <dbl>, ADA <dbl>, ADA2 <dbl>, ADAM15 <dbl>,
#> # ADAM23 <dbl>, ADAM8 <dbl>, ADAMTS13 <dbl>, ADAMTS15 <dbl>, ADAMTS16 <dbl>,
#> # ADAMTS8 <dbl>, ADCYAP1R1 <dbl>, ADGRE2 <dbl>, ADGRE5 <dbl>, ADGRG1 <dbl>,
#> # ADGRG2 <dbl>, ADH4 <dbl>, ADM <dbl>, AGER <dbl>, AGR2 <dbl>, AGR3 <dbl>,
#> # AGRN <dbl>, AGRP <dbl>, AGXT <dbl>, AHCY <dbl>, AHSP <dbl>, AIF1 <dbl>,
#> # AIFM1 <dbl>, AK1 <dbl>, AKR1B1 <dbl>, AKR1C4 <dbl>, AKT1S1 <dbl>, …