normalize_data()
normalizes the data by scaling it and removing batch effects.
It first converts the data to wide format if it is not already. It then removes
the batch effects and scales or centers the data. To remove batch effects, it uses
remove_batch_effects(), which relies on the limma package. For scaling, it uses
scale() from base R.
Usage
normalize_data(
olink_data,
metadata = NULL,
wide = TRUE,
center = TRUE,
scale = TRUE,
batch = NULL,
batch2 = NULL,
return_long = FALSE,
save = FALSE,
file_name = "normalized_data"
)
Arguments
- olink_data
A dataset containing Olink data to be normalized.
- metadata
A dataset containing the metadata information.
- wide
A logical value indicating whether the data is in wide format. Default is TRUE.
- center
A logical value indicating whether to center the data. Default is TRUE.
- scale
A logical value indicating whether to scale the data. Default is TRUE.
- batch
The name of the metadata column containing the batch information. This parameter must be provided in order to correct for batch effects. Default is NULL.
- batch2
The name of the metadata column containing a second batch variable, if there is one. Default is NULL.
- return_long
A logical value indicating whether to return the data in long format. Default is FALSE.
- save
A logical value indicating whether to save the data. Default is FALSE.
- file_name
The name of the file to be saved. Default is "normalized_data".
Examples
# Non-normalized data
example_data |>
dplyr::select(DAid, Assay, NPX) |>
tidyr::pivot_wider(names_from = "Assay", values_from = "NPX")
#> # A tibble: 586 × 101
#> DAid AARSD1 ABL1 ACAA1 ACAN ACE2 ACOX1 ACP5 ACP6 ACTA2
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 DA00001 3.39 2.76 1.71 0.0333 1.76 -0.919 1.54 2.15 2.81
#> 2 DA00002 1.42 1.25 -0.816 -0.459 0.826 -0.902 0.647 1.30 0.798
#> 3 DA00003 NA NA NA 0.989 NA 0.330 1.37 NA NA
#> 4 DA00004 3.41 3.38 1.69 NA 1.52 NA 0.841 0.582 1.70
#> 5 DA00005 5.01 5.05 0.128 0.401 -0.933 -0.584 0.0265 1.16 2.73
#> 6 DA00006 6.83 1.18 -1.74 -0.156 1.53 -0.721 0.620 0.527 0.772
#> 7 DA00007 NA NA 3.96 0.682 3.14 2.62 1.47 2.25 2.01
#> 8 DA00008 2.78 0.812 -0.552 0.982 -0.101 -0.304 0.376 -0.826 1.52
#> 9 DA00009 4.39 3.34 -0.452 -0.868 0.395 1.71 1.49 -0.0285 0.200
#> 10 DA00010 1.83 1.21 -0.912 -1.04 -0.0918 -0.304 1.69 0.0920 2.04
#> # ℹ 576 more rows
#> # ℹ 91 more variables: ACTN4 <dbl>, ACY1 <dbl>, ADA <dbl>, ADA2 <dbl>,
#> # ADAM15 <dbl>, ADAM23 <dbl>, ADAM8 <dbl>, ADAMTS13 <dbl>, ADAMTS15 <dbl>,
#> # ADAMTS16 <dbl>, ADAMTS8 <dbl>, ADCYAP1R1 <dbl>, ADGRE2 <dbl>, ADGRE5 <dbl>,
#> # ADGRG1 <dbl>, ADGRG2 <dbl>, ADH4 <dbl>, ADM <dbl>, AGER <dbl>, AGR2 <dbl>,
#> # AGR3 <dbl>, AGRN <dbl>, AGRP <dbl>, AGXT <dbl>, AHCY <dbl>, AHSP <dbl>,
#> # AIF1 <dbl>, AIFM1 <dbl>, AK1 <dbl>, AKR1B1 <dbl>, AKR1C4 <dbl>, …
# Center data
normalize_data(example_data, example_metadata, wide = FALSE, center = TRUE, scale = FALSE)
#> # A tibble: 586 × 101
#> DAid AARSD1 ABL1 ACAA1 ACAN ACE2 ACOX1 ACP5 ACP6 ACTA2
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 DA00001 0.259 0.949 0.697 -0.532 0.827 -1.42 0.612 1.02 1.20
#> 2 DA00002 -1.71 -0.563 -1.83 -1.02 -0.102 -1.40 -0.278 0.168 -0.813
#> 3 DA00003 NA NA NA 0.424 NA -0.167 0.447 NA NA
#> 4 DA00004 0.278 1.57 0.683 NA 0.593 NA -0.0839 -0.551 0.0892
#> 5 DA00005 1.88 3.24 -0.882 -0.165 -1.86 -1.08 -0.898 0.0236 1.12
#> 6 DA00006 3.70 -0.628 -2.75 -0.721 0.600 -1.22 -0.305 -0.606 -0.840
#> 7 DA00007 NA NA 2.95 0.117 2.21 2.12 0.548 1.12 0.398
#> 8 DA00008 -0.351 -0.998 -1.56 0.416 -1.03 -0.800 -0.549 -1.96 -0.0901
#> 9 DA00009 1.26 1.53 -1.46 -1.43 -0.533 1.21 0.562 -1.16 -1.41
#> 10 DA00010 -1.30 -0.596 -1.92 -1.60 -1.02 -0.801 0.765 -1.04 0.427
#> # ℹ 576 more rows
#> # ℹ 91 more variables: ACTN4 <dbl>, ACY1 <dbl>, ADA <dbl>, ADA2 <dbl>,
#> # ADAM15 <dbl>, ADAM23 <dbl>, ADAM8 <dbl>, ADAMTS13 <dbl>, ADAMTS15 <dbl>,
#> # ADAMTS16 <dbl>, ADAMTS8 <dbl>, ADCYAP1R1 <dbl>, ADGRE2 <dbl>, ADGRE5 <dbl>,
#> # ADGRG1 <dbl>, ADGRG2 <dbl>, ADH4 <dbl>, ADM <dbl>, AGER <dbl>, AGR2 <dbl>,
#> # AGR3 <dbl>, AGRN <dbl>, AGRP <dbl>, AGXT <dbl>, AHCY <dbl>, AHSP <dbl>,
#> # AIF1 <dbl>, AIFM1 <dbl>, AK1 <dbl>, AKR1B1 <dbl>, AKR1C4 <dbl>, …
# Center and scale data (z-score scaling)
normalize_data(example_data, example_metadata, wide = FALSE, center = TRUE, scale = TRUE)
#> # A tibble: 586 × 101
#> DAid AARSD1 ABL1 ACAA1 ACAN ACE2 ACOX1 ACP5 ACP6 ACTA2
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 DA00001 0.240 0.685 0.498 -0.753 0.722 -1.39 0.800 0.991 1.16
#> 2 DA00002 -1.58 -0.406 -1.30 -1.45 -0.0885 -1.37 -0.364 0.163 -0.786
#> 3 DA00003 NA NA NA 0.600 NA -0.163 0.584 NA NA
#> 4 DA00004 0.257 1.14 0.488 NA 0.517 NA -0.110 -0.536 0.0862
#> 5 DA00005 1.74 2.34 -0.629 -0.233 -1.62 -1.06 -1.18 0.0230 1.08
#> 6 DA00006 3.42 -0.453 -1.96 -1.02 0.523 -1.19 -0.399 -0.590 -0.812
#> 7 DA00007 NA NA 2.11 0.165 1.93 2.08 0.717 1.09 0.385
#> 8 DA00008 -0.325 -0.721 -1.12 0.589 -0.898 -0.783 -0.719 -1.90 -0.0871
#> 9 DA00009 1.17 1.11 -1.04 -2.03 -0.464 1.18 0.735 -1.13 -1.36
#> 10 DA00010 -1.20 -0.431 -1.37 -2.27 -0.889 -0.784 1.00 -1.01 0.413
#> # ℹ 576 more rows
#> # ℹ 91 more variables: ACTN4 <dbl>, ACY1 <dbl>, ADA <dbl>, ADA2 <dbl>,
#> # ADAM15 <dbl>, ADAM23 <dbl>, ADAM8 <dbl>, ADAMTS13 <dbl>, ADAMTS15 <dbl>,
#> # ADAMTS16 <dbl>, ADAMTS8 <dbl>, ADCYAP1R1 <dbl>, ADGRE2 <dbl>, ADGRE5 <dbl>,
#> # ADGRG1 <dbl>, ADGRG2 <dbl>, ADH4 <dbl>, ADM <dbl>, AGER <dbl>, AGR2 <dbl>,
#> # AGR3 <dbl>, AGRN <dbl>, AGRP <dbl>, AGXT <dbl>, AHCY <dbl>, AHSP <dbl>,
#> # AIF1 <dbl>, AIFM1 <dbl>, AK1 <dbl>, AKR1B1 <dbl>, AKR1C4 <dbl>, …
# Center, scale and remove batch effects
normalize_data(example_data, example_metadata, wide = FALSE, batch = "Cohort")
#> # A tibble: 586 × 101
#> DAid AARSD1 ABL1 ACAA1 ACAN ACE2 ACOX1 ACP5 ACP6 ACTA2 ACTN4
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 DA000… 0.104 0.476 0.391 -0.779 0.544 -1.46 0.701 0.915 0.985 0.516
#> 2 DA000… -1.74 -0.656 -1.43 -1.48 -0.286 -1.44 -0.471 0.0838 -1.02 -0.761
#> 3 DA000… NA NA NA 0.574 NA -0.232 0.484 NA NA NA
#> 4 DA000… 0.121 0.944 0.380 NA 0.335 NA -0.215 -0.618 -0.122 -0.486
#> 5 DA000… 1.62 2.20 -0.746 -0.259 -1.86 -1.13 -1.29 -0.0569 0.903 -0.103
#> 6 DA000… 3.32 -0.705 -2.09 -1.05 0.341 -1.26 -0.507 -0.672 -1.05 NA
#> 7 DA000… NA NA 2.02 0.140 1.78 2.01 0.618 1.01 0.186 -0.388
#> 8 DA000… -0.468 -0.983 -1.24 0.563 -1.12 -0.854 -0.828 -1.99 -0.300 -1.60
#> 9 DA000… 1.04 0.915 -1.16 -2.05 -0.671 1.12 0.636 -1.21 -1.61 -1.50
#> 10 DA000… -1.36 -0.681 -1.49 -2.30 -1.11 -0.855 0.903 -1.10 0.214 0.136
#> # ℹ 576 more rows
#> # ℹ 90 more variables: ACY1 <dbl>, ADA <dbl>, ADA2 <dbl>, ADAM15 <dbl>,
#> # ADAM23 <dbl>, ADAM8 <dbl>, ADAMTS13 <dbl>, ADAMTS15 <dbl>, ADAMTS16 <dbl>,
#> # ADAMTS8 <dbl>, ADCYAP1R1 <dbl>, ADGRE2 <dbl>, ADGRE5 <dbl>, ADGRG1 <dbl>,
#> # ADGRG2 <dbl>, ADH4 <dbl>, ADM <dbl>, AGER <dbl>, AGR2 <dbl>, AGR3 <dbl>,
#> # AGRN <dbl>, AGRP <dbl>, AGXT <dbl>, AHCY <dbl>, AHSP <dbl>, AIF1 <dbl>,
#> # AIFM1 <dbl>, AK1 <dbl>, AKR1B1 <dbl>, AKR1C4 <dbl>, AKT1S1 <dbl>, …