Analysis of the UK Biobank neuroimaging data to build a model of brain ageing using multiple neuroimaging modalities. James Cole, 2nd October 2019. The analysis was updated to reflect further analysis in response to reviewers in March 2020.
The data were initially read using Ken Hanscombe’s ukbtools package (ukb_df tool), which reads the .tab file, .r script and .html. This is slow process for large files, hence here the data are read and than saved as .rda files, which can be loaded more swiftly for convenience.
# clear workspace
rm(list = ls())
# Load packages
library(boot)
library(corrplot)
library(cowplot)
library(dplyr)
library(ggplot2)
library(glmnet)
library(heplots)
library(hier.part)
library(Hmisc)
library(kableExtra)
library(knitr)
library(lm.beta)
library(psych)
library(pwr)
library(tidyverse)
library(ukbtools)
Looks for existing file and loads is available.
if (file.exists("ukb_data.rda")) {
cat("Loading data file", date(),sep = " ")
load(file = "ukb_data.rda")
} else {
tmp <- ukb_df("ukb23892", path = "/Volumes/home/analysis/UKBiobank/ID_23892")
tmp1 <- ukb_df("ukb26571", path = "/Volumes/home/analysis/UKBiobank/ID_26571")
tmp2 <- read.table("rs_fMRI_data.txt", colClasses = c("eid" = "character"))
df <- list(tmp, tmp1, tmp2) %>% reduce(left_join, by = "eid") # using purr
save(df, file = "ukb_data.rda")
rm(list = ls(pattern = "tmp*"))
}
## Loading data file Wed Mar 4 15:30:18 2020
Data from n = 22392 UK Biobank participants were downloaded for the study.
Field 12188 uses coding 0=no, 1=yes, -1=unknown. Subset to only include those who completed brain MRI from imaging assessment.
df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0 <- as.factor(df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0)
table(df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0)
##
## -1 0 1
## 49 2025 20237
df <- subset(df, df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0 == 1)
df[,grep("age_when_attended", names(df))] <- lapply(df[,grep("age_when_attended", names(df))], as.numeric)
df$sex <- factor(df$sexuses_datacoding_9_f31_0_0)
df$sex <- dplyr::recode(df$sex, "0" = "Female", "1" = "Male")
df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 <- factor(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0)
df$stroke_history <- factor(!is.na(df$age_stroke_diagnoseduses_datacoding_100291_f4056_2_0) & as.numeric(df$age_stroke_diagnoseduses_datacoding_100291_f4056_2_0) > 0, labels = c("no stroke", "stroke"))
var.list <- c("body_mass_index_bmi_f21001_2_0",
"diastolic_blood_pressure_automated_reading_f4079_2_0",
"diastolic_blood_pressure_automated_reading_f4079_2_1",
"systolic_blood_pressure_automated_reading_f4080_2_0",
"systolic_blood_pressure_automated_reading_f4080_2_1",
"hip_circumference_f49_2_0",
"weight_f21002_2_0",
"hand_grip_strength_left_f46_2_0",
"hand_grip_strength_right_f47_2_0",
"overall_health_ratinguses_datacoding_100508_f2178_2_0",
"longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0",
"height_f12144_2_0",
"mean_tfmri_head_motion_averaged_across_space_and_time_points_f25742_2_0",
"mean_rfmri_head_motion_averaged_across_space_and_time_points_f25741_2_0",
"volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0",
"fluid_intelligence_score_f20016_2_0",
"duration_to_complete_numeric_path_trail_1uses_datacoding_1990_f6348_2_0",
"duration_to_complete_alphanumeric_path_trail_2uses_datacoding_1990_f6350_2_0",
"number_of_puzzles_correctly_solved_f6373_2_0",
"duration_spent_answering_each_puzzle_f6333_2_0",
"number_of_puzzles_correct_f6382_2_0",
"duration_of_moderate_activityuses_datacoding_100291_f894_2_0",
"duration_of_vigorous_activityuses_datacoding_100291_f914_2_0")
df[,var.list] <- lapply(df[,var.list], as.numeric)
imaging.vars.list <- grep("forced|nifti|discrepancy", grep("volume|t2star|t2_flair|bold|activation|skeleton|tract|Partial_corr_25_dim", names(df), value = T), invert = T, value = T)
df[,grep("volume", names(df))] <- lapply(df[,grep("volume", names(df))], as.numeric)
df[,grep("t2star", names(df))] <- lapply(df[,grep("t2star", names(df))], as.numeric)
df[,grep("t2_flair", names(df))] <- lapply(df[,grep("t2_flair", names(df))], as.numeric)
## Warning in lapply(df[, grep("t2_flair", names(df))], as.numeric): NAs
## introduced by coercion
df[,grep("bold", names(df))] <- lapply(df[,grep("bold", names(df))], as.numeric)
df[,grep("skeleton", names(df))] <- lapply(df[,grep("skeleton", names(df))], as.numeric)
df[,grep("tract", names(df))] <- lapply(df[,grep("tract", names(df))], as.numeric)
df[,grep("activation", names(df))] <- lapply(df[,grep("activation", names(df))], as.numeric)
Instance 0 is the baseline demographic clinical visit from 2006-2010. Instance 1 is a follow-up from 2012, no imaging then. Exclude people without complete imaging data.
table(complete.cases(df[,c(imaging.vars.list)]))
##
## FALSE TRUE
## 2776 17461
df <- df[complete.cases(df[,c(imaging.vars.list)]),]
df$age_at_scan <- df$age_when_attended_assessment_centre_f21003_2_0
describe(df$age_at_scan)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 17461 62.46 7.42 63 62.57 8.9 45 80 35 -0.14 -0.89
## se
## X1 0.06
table(df$sex)
##
## Female Male
## 9274 8187
ggplot(df, aes(age_at_scan)) +
geom_histogram(binwidth = 1, fill = "darkgoldenrod2", colour = "black", lwd = 0.25) +
xlab('Age at scan (years)') +
theme_cowplot()
Function to generate a data.frame with TRUE or FALSE for the presence/absence of ICD diagnosis. Adapted from Ken’s ukb_icd_diagnosis() function.
has_icd.diagnosis <- function(id, icd.version){
x <- df %>% dplyr::filter(eid %in% id) %>%
dplyr::select(matches(paste("^diagnoses.*icd", icd.version, sep = ""))) %>%
dplyr::select_if(colSums(!is.na(.)) > 0) %>% ncol() != 0
return(data.frame(id, x))
}
Running on the whole dataset takes >4 hours for n>20,000. So load .RDA file if avaialble.
if (file.exists("ukb_icd.diagnosis_data.rda")) {
load("ukb_icd.diagnosis_data.rda")
} else {
has_icd.diagnosis.df <- do.call(rbind, lapply(df$eid, function(x) has_icd.diagnosis(x, 10)))
save(has_icd.diagnosis.df, file = "ukb_icd.diagnosis_data.rda")
}
Merge to add health status variable.
names(has_icd.diagnosis.df) <- c("eid", "icd_positive")
df <- merge(df, has_icd.diagnosis.df)
table(df$icd_positive)
##
## FALSE TRUE
## 4708 12753
Check number of ICD-positive against subjective health and longstanding illness.
table(df$overall_health_ratinguses_datacoding_100508_f2178_2_0, df$icd_positive)
##
## FALSE TRUE
## -3 0 3
## -1 3 16
## 1 1180 2141
## 2 3018 7963
## 3 436 2244
## 4 36 307
table(df$longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0, df$icd_positive)
##
## FALSE TRUE
## -3 1 14
## -1 40 196
## 0 3899 8723
## 1 733 3741
Include people with no ICD-10 diagnosis, no self-reported long-standing illness disability or infirmity (F2188) and good or excellent self-reported health (F2178). Remove subjects with NAs in the age column and with the missing imaging variables.
df$healthy <- ifelse(df$icd_positive == "FALSE" & df$longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0 == 0 & df$overall_health_ratinguses_datacoding_100508_f2178_2_0 >= 2 & df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 == 0 & df$stroke_history != "stroke", "healthy", "non-healthy")
healthy.df <- subset(df, df$healthy == "healthy")
table(is.na(df$healthy))
##
## FALSE TRUE
## 17426 35
Run descriptive statistics
describeBy(df$age_at_scan, df$healthy)
##
## Descriptive statistics by group
## group: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2725 61.47 7.21 62 61.48 8.9 46 79 33 -0.02 -0.9
## se
## X1 0.14
## --------------------------------------------------------
## group: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 14701 62.64 7.45 63 62.78 8.9 45 80 35 -0.16 -0.88
## se
## X1 0.06
table(df$sex, df$healthy)
##
## healthy non-healthy
## Female 1343 7914
## Male 1382 6787
round(prop.table(table(df$sex, df$healthy), 2),3)
##
## healthy non-healthy
## Female 0.493 0.538
## Male 0.507 0.462
ggplot(subset(df, !is.na(df$healthy)), aes(age_at_scan, color = healthy)) +
geom_density(aes(fill = healthy), alpha = 0.5) +
xlab("Age at MRI scan (years)") +
scale_fill_brewer(palette = "Set1") +
scale_color_brewer(palette = "Set1") +
theme_cowplot()
Lots of missing data here, around 50% NAs Coding here: http://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001
table(is.na(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0))
##
## FALSE TRUE
## 8563 8898
table(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0, df$healthy)
##
## healthy non-healthy
## -1 1 1
## -3 3 9
## 1 1 1
## 1001 1236 6694
## 1002 27 188
## 1003 33 144
## 2 0 1
## 2001 2 11
## 2002 1 4
## 2003 1 7
## 2004 5 12
## 3001 11 46
## 3002 1 13
## 3003 1 0
## 3004 2 11
## 4001 1 23
## 4002 2 16
## 5 4 17
## 6 5 28
round(100*prop.table(table(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0, df$healthy), 2),2)
##
## healthy non-healthy
## -1 0.07 0.01
## -3 0.22 0.12
## 1 0.07 0.01
## 1001 92.45 92.64
## 1002 2.02 2.60
## 1003 2.47 1.99
## 2 0.00 0.01
## 2001 0.15 0.15
## 2002 0.07 0.06
## 2003 0.07 0.10
## 2004 0.37 0.17
## 3001 0.82 0.64
## 3002 0.07 0.18
## 3003 0.07 0.00
## 3004 0.15 0.15
## 4001 0.07 0.32
## 4002 0.15 0.22
## 5 0.30 0.24
## 6 0.37 0.39
BMI
by(df$body_mass_index_bmi_f21001_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2663 26.44 4.16 25.94 26.12 3.74 16.4 58.04 41.65 1.05 2.77
## se Q0.25 Q0.75
## 1 0.08 23.58 28.68
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 14348 26.6 4.41 25.94 26.22 3.87 13.39 55.23 41.84 1.05 2.18
## se Q0.25 Q0.75
## 1 0.04 23.56 28.88
Weight
by(df$weight_f21002_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2665 76.3 14.8 75 75.57 14.53 41.6 179.8 138.2 0.73 1.81
## se Q0.25 Q0.75
## 1 0.29 65.6 85.4
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew
## 1 1 14382 76.14 15.13 74.6 75.15 14.68 33 169.2 136.2 0.79
## kurtosis se Q0.25 Q0.75
## 1 1.27 0.13 65.2 85
Hip circumference
by(df$hip_circumference_f49_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2667 100.98 8.19 100 100.49 7.41 80 152 72 0.83 1.92
## se Q0.25 Q0.75
## 1 0.16 95.5 105
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 14400 101.39 8.75 100 100.77 7.41 68 156 88 0.97 2.36
## se Q0.25 Q0.75
## 1 0.07 96 106
print("Diastolic")
## [1] "Diastolic"
by(df$diastolic_blood_pressure_automated_reading_f4079_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2254 79.67 10.61 79 79.52 10.38 43 114 71 0.14 -0.03
## se Q0.25 Q0.75
## 1 0.22 72 87
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 12097 78.35 10.53 78 78.1 10.38 38 128 90 0.27 0.17
## se Q0.25 Q0.75
## 1 0.1 71 85
print("Systolic")
## [1] "Systolic"
by(df$systolic_blood_pressure_automated_reading_f4080_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 2254 139.55 18.95 139 138.86 19.27 81 212 131 0.37 0.06
## se Q0.25 Q0.75
## 1 0.4 126 151.75
## --------------------------------------------------------
## df$healthy: non-healthy
## vars n mean sd median trimmed mad min max range skew kurtosis
## 1 1 12097 138.43 18.94 137 137.7 19.27 78 228 150 0.43 0.38
## se Q0.25 Q0.75
## 1 0.17 125 150
table(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, df$healthy)
##
## healthy non-healthy
## -1 0 24
## -3 0 15
## 0 2725 13747
## 1 0 836
round(100*prop.table(table(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, df$healthy), 2),2)
##
## healthy non-healthy
## -1 0.00 0.16
## -3 0.00 0.10
## 0 100.00 94.02
## 1 0.00 5.72
table(df$stroke_history, df$healthy)
##
## healthy non-healthy
## no stroke 2725 14500
## stroke 0 201
round(100*prop.table(table(df$stroke_history, df$healthy), 2),2)
##
## healthy non-healthy
## no stroke 100.00 98.63
## stroke 0.00 1.37
Use 80% for training and 20% for validation/testing.
# Determine sample size and assign training/test group variables
set.seed(1982)
index <- sample(2, nrow(healthy.df), replace = TRUE, prob = c(0.8, 0.2))
table(index)
## index
## 1 2
## 2205 520
# Split data into training/testing and keep imaging variables only
train_data <- healthy.df[index == 1, imaging.vars.list]
test_data <- healthy.df[index == 2, imaging.vars.list]
# Define objects with age labels for training and test sets
train_labels <- healthy.df[index == 1, "age_at_scan"]
test_labels <- healthy.df[index == 2, "age_at_scan"]
describe(train_labels)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2205 61.54 7.24 62 61.54 8.9 46 79 33 -0.03 -0.89
## se
## X1 0.15
describe(test_labels)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 520 61.2 7.07 61 61.2 8.9 47 78 31 0 -0.97 0.31
par(mfrow = c(1,2))
hist(train_labels, breaks = 20, col = "darkgoldenrod2", main = "Training set age", xlab = "Age at scan (years)")
hist(test_labels, breaks = 20, col = "darkgoldenrod2", main = "Validation set age", xlab = "Age at scan (years)")
par(mfrow = c(1,1))
This is essential for ANN and probably a good idea for all the models. #### Important, need to apply scaling parameters to new data
scaled.train_data <- scale(train_data, scale = TRUE, center = TRUE)
scaling.parameters.center <- attr(scaled.train_data, "scaled:center")
scaling.parameters.scale <- attr(scaled.train_data, "scaled:scale")
scaled.test_data <- as.data.frame(scale(test_data, scaling.parameters.center, scaling.parameters.scale))
scaled.train_data <- as.data.frame(scaled.train_data)
Plot of n=1079 variables using the index order (which is basically arbitrary). With n = 2725, very small r values will be significant at 0.05. Bonferroni adjusted pvalues need to be below 0.05/1079 = 4.633920310^{-5}.
alpha <- 0.05
power <- pwr.r.test(n = length(healthy.df$age_at_scan), sig.level = alpha, power = 0.8, alternative = "greater")
power.bonf <- pwr.r.test(n = length(healthy.df$age_at_scan), sig.level = alpha/length(imaging.vars.list), power = 0.8, alternative = "greater")
plot(sapply(healthy.df[imaging.vars.list], function(x) cor(x, healthy.df$age_at_scan)),
type = "h",
col = "darkgoldenrod2",
ylab = "Pearson's r with age");abline(h = 0);abline(h = power$r, lty = "dashed", col = "grey40");abline(h = (0 - power$r), lty = "dashed", col = "grey40"); abline(h = power.bonf$r, col = "darkred", lty = 2); abline(h = (0 - power.bonf$r), col = "darkred", lty = 2)
text(x = 0, y = 0.065, "Uncorrected p = 0.05", col = "grey40", adj = 0, cex = 0.8)
text(x = 0, y = 0.13, "Bonferroni p = 0.05", col = "darkred", adj = 0, cex = 0.8)
Of the univariate correlations with age, 886 are significant at p = 0.05. When using Bonferroni correction 704 are significant at p = 0.05.
Take predicted age values as input.
test_results <- function(pred) {
r <- cor.test(test_labels, pred)$estimate
r.sq <- summary(lm(test_labels ~ pred))$r.squared
MAE <- mean(abs(pred - test_labels), na.rm = T)
age.bias <- cor.test(test_labels, (pred - test_labels))$estimate
value <- sapply(c(r,r.sq, MAE, age.bias), function(x) round(x, 3))
results <- cbind(c("r", "R^2", "MAE", "Age.bias"), value)
return(kable(results) %>% kable_styling(bootstrap_options = c("striped","condensed", "responsive", full_width = F, position = "centre")))
}
age_plot <- function(pred) {
ggplot() +
geom_abline(slope = 1, intercept = 0) +
geom_point(aes(x = test_labels, y = pred), shape = 21, bg = "darkgoldenrod2", size = 2) +
geom_smooth(aes(x = test_labels, y = pred), method = "lm", col = "darkgrey") +
labs(title = deparse(substitute(pred)), x = "Age (years)", y = "Brain-predicted age (years)") +
theme_cowplot()
}
Using the glmnet package. Alpha = 1 is for LASSO penalisation (0 = ridge, 0.5 = elastic net).
x.train <- as.matrix(scaled.train_data)
dimnames(x.train) <- NULL
y.train <- as.matrix(train_labels)
## cross-validation for lambda
set.seed(1883)
lasso.fit.cv <- cv.glmnet(x = x.train, y = y.train,
alpha = 1, family = "gaussian")
Plot results. The minimum lambda value is 0.05, while the optimal lambda value (i.e., the highest value within 1 standard error of the minimum) is 0.127.
plot(lasso.fit.cv)
Fit model using previously optimised (through CV) lambda value (1 SE value, not minimum).
lasso.fit <- glmnet(x = x.train, y = y.train,
alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
lasso.pred <- predict(lasso.fit, newx = as.matrix(scaled.test_data))
test_results(lasso.pred)
value | ||
---|---|---|
cor | r | 0.78 |
R^2 | 0.609 | |
MAE | 3.549 | |
cor | Age.bias | -0.671 |
age_plot(lasso.pred)
ggsave("~/Work/Articles/Brain age/UK Biobank multi-modal brain age/brain_age_scatterplot.pdf", useDingbats = FALSE, dpi = 75, height = 4, width = 4)
LASSO.coefficient <- coef(lasso.fit, s = lasso.fit.cv$lambda.1se)[-1]
var.coefs <- data.frame(imaging.vars.list, LASSO.coefficient)
non.zero_vars <- subset(var.coefs, var.coefs$LASSO.coefficient != 0)
non.zero_vars$imaging.vars.list <- factor(non.zero_vars$imaging.vars.list)
Out of the original 1079 variables, the LASSO regression set 188 to non-zero, thus 891 variables were removed.
Bootstrap 95% confidence intervals. Uses the boot package.
Essential to convert coefficients to vector that stores zeros.
lasso.coef <- function(data, indices) {
d <- data[indices,]
fit <- glmnet(x = d[,-1], y = d[,1],
alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
return(coef(fit)[,1])
}
Normal printing and plotting of results doesn’t work for high-dimensional datasets. Load data file if it already exists.
if (file.exists("lasso.boot.out.rda")) {
load("lasso.boot.out.rda")
cat("loading existing bootstrap file")
} else {
cat("running bootstraps")
boot.out <- boot(data = cbind(y.train, x.train), statistic = lasso.coef, R = 1000)
save(boot.out, file = "lasso.boot.out.rda")
}
## loading existing bootstrap file
There were 221 non-zero coefficients.
Check histogram of bootstrap coefficients for top variable by way of example.
ggplot() +
geom_histogram(bins = 100, aes(boot.out$t[,which.max(abs(boot.out$t0[-1])) + 1]),
fill = "darkgoldenrod2",
colour = "black",
lwd = 0.25) +
xlab("Top variable bootstrapped coefficients") +
theme_cowplot()
ci.vector <- function(index, boot.object, ci.type) {
x <- boot.ci(boot.object, type = ci.type, index = index)
return(x[4])
}
Use my ci.vector() function (defined above) to derive confidence intervals.
n <- length(boot.out$t0)
boot.ci.out <- sapply(1:n, ci.vector, boot.object = boot.out, ci.type = "basic")
x <- boot.out$t0[1:n]
y <- data.frame(t(matrix(unlist(boot.ci.out), ncol = n)))[4:5]
ci.df <- cbind(x, y)
names(ci.df) <- c("coef", "l.ci", "u.ci")
Identify variables with confidence intervals that do not overlap zero.
# drop intercept from plot using [-1] in vector ci.df$l.ci and ci.df$u.ci (i.e., the intercept is the top row)
sig.vars.index <- which(ci.df$l.ci[-1] > 0 | ci.df$u.ci[-1] < 0)
sig.vars.list <- imaging.vars.list[sig.vars.index]
sig.vars.df <- ci.df[sig.vars.index + 1,] ## add 1 to omit intercept row
sig.vars.df <- cbind(sig.vars.list, round(sig.vars.df,3))
kable(sig.vars.df[order(abs(sig.vars.df$coef), decreasing = T),]) %>% kable_styling(bootstrap_options = c("striped","condensed", "responsive", full_width = F, position = "centre"), fixed_thead = list(enabled = T, background = "lightgrey"))
sig.vars.list | coef | l.ci | u.ci | |
---|---|---|---|---|
V6 | volume_of_grey_matter_normalised_for_head_size_f25005_2_0 | -1.466 | -1.851 | -1.242 |
V800 | weightedmean_icvf_in_tract_forceps_minor_f25661_2_0 | -1.029 | -1.789 | -0.699 |
V26 | volume_of_brain_stem_4th_ventricle_f25025_2_0 | 0.692 | 0.484 | 1.139 |
V164 | volume_of_grey_matter_in_ventral_striatum_left_f25890_2_0 | -0.666 | -1.022 | -0.393 |
V711 | weightedmean_l1_in_tract_anterior_thalamic_radiation_right_f25572_2_0 | 0.513 | 0.218 | 1.026 |
V584 | mean_isovf_in_fornix_on_fa_skeleton_f25445_2_0 | 0.501 | 0.219 | 1.003 |
V195 | mean_fa_in_middle_cerebellar_peduncle_on_fa_skeleton_f25056_2_0 | -0.477 | -0.910 | -0.275 |
V157 | volume_of_grey_matter_in_putamen_right_f25883_2_0 | 0.460 | 0.262 | 0.911 |
V13 | volume_of_thalamus_right_f25012_2_0 | -0.454 | -0.909 | -0.116 |
V210 | mean_fa_in_cerebral_peduncle_on_fa_skeleton_left_f25071_2_0 | -0.453 | -0.906 | -0.226 |
V208 | mean_fa_in_superior_cerebellar_peduncle_on_fa_skeleton_left_f25069_2_0 | 0.429 | 0.053 | 0.859 |
V339 | mean_l1_in_middle_cerebellar_peduncle_on_fa_skeleton_f25200_2_0 | -0.428 | -0.815 | -0.162 |
V463 | mean_l3_in_posterior_thalamic_radiation_on_fa_skeleton_right_f25324_2_0 | 0.418 | 0.129 | 0.835 |
V426 | mean_l2_in_fornix_cresstria_terminalis_on_fa_skeleton_left_f25287_2_0 | 0.393 | 0.062 | 0.787 |
V486 | mean_icvf_in_body_of_corpus_callosum_on_fa_skeleton_f25347_2_0 | 0.381 | 0.031 | 0.762 |
V356 | mean_l1_in_anterior_limb_of_internal_capsule_on_fa_skeleton_left_f25217_2_0 | 0.349 | 0.095 | 0.698 |
V330 | mean_mo_in_fornix_cresstria_terminalis_on_fa_skeleton_left_f25191_2_0 | -0.336 | -0.610 | -0.061 |
V173 | volume_of_grey_matter_in_vi_cerebellum_right_f25899_2_0 | -0.330 | -0.661 | -0.059 |
V137 | volume_of_grey_matter_in_frontal_operculum_cortex_right_f25863_2_0 | -0.321 | -0.582 | -0.146 |
V572 | mean_od_in_superior_longitudinal_fasciculus_on_fa_skeleton_left_f25433_2_0 | 0.316 | 0.169 | 0.633 |
V178 | volume_of_grey_matter_in_vermis_crus_ii_cerebellum_f25904_2_0 | 0.306 | 0.115 | 0.532 |
V787 | weightedmean_l3_in_tract_uncinate_fasciculus_left_f25648_2_0 | 0.302 | 0.002 | 0.603 |
V16 | volume_of_putamen_left_f25015_2_0 | -0.284 | -0.567 | -0.011 |
V279 | mean_md_in_cingulum_hippocampus_on_fa_skeleton_right_f25140_2_0 | -0.272 | -0.544 | -0.109 |
V391 | mean_l2_in_splenium_of_corpus_callosum_on_fa_skeleton_f25252_2_0 | -0.271 | -0.542 | -0.014 |
V683 | weightedmean_mo_in_tract_anterior_thalamic_radiation_left_f25544_2_0 | 0.263 | 0.065 | 0.526 |
V48 | 90th_percentile_of_bold_effect_in_groupdefined_mask_for_shapes_activation_f25761_2_0 | 0.229 | 0.050 | 0.425 |
V692 | weightedmean_mo_in_tract_forceps_minor_f25553_2_0 | -0.226 | -0.452 | -0.031 |
V31 | median_t2star_in_putamen_left_f25030_2_0 | -0.221 | -0.442 | -0.016 |
V153 | volume_of_grey_matter_in_thalamus_right_f25879_2_0 | 0.217 | 0.005 | 0.435 |
V45 | median_bold_effect_in_groupdefined_mask_for_facesshapes_contrast_f25048_2_0 | -0.212 | -0.423 | -0.060 |
V714 | weightedmean_l1_in_tract_parahippocampal_part_of_cingulum_left_f25575_2_0 | -0.174 | -0.348 | -0.001 |
V1026 | Partial_corr_25_dim_25752_2_0_v157 | 0.165 | 0.024 | 0.329 |
V421 | mean_l2_in_cingulum_cingulate_gyrus_on_fa_skeleton_right_f25282_2_0 | -0.119 | -0.237 | -0.009 |
## sort dataset by coefficient
ci.df2 <- ci.df[order(ci.df$coef, decreasing = T),]
# drop intercept from plot using [-1,] in data.frame ci.df (i.e., the intercept is the top row)
plot(ci.df2[-1,1], ylim = c(min(ci.df2[-1,2]), max(ci.df2[-1,3])),
pch = 20, col = "darkgoldenrod2", ylab = "LASSO coefficient") +
arrows(x0 = 1:(n - 1), y0 = ci.df2[-1,2], y1 = ci.df2[-1,3],
length = 0.02, angle = 90, code = 3, col = "grey") +
abline(h = 0, type = 2)
## integer(0)
There are 34 variables with CIs that don’t overlap zero.
## sort dataset by coefficient
sig.vars.df2 <- sig.vars.df[order(sig.vars.df$coef, decreasing = T),]
opar <- par()
par(mar = c(15, 4, 1, 2))
axis_labels <- gsub("_f....._2_0", "", sig.vars.df2[,1])
plot(sig.vars.df2$coef, ylim = c(min(sig.vars.df2$l.ci),max(sig.vars.df2$u.ci)),
pch = 20, col = "darkgoldenrod2", ylab = "LASSO coefficient", xaxt = "n", xlab = "") +
arrows(x0 = 1:dim(sig.vars.df2)[1], y0 = sig.vars.df2$l.ci, y1 = sig.vars.df2$u.ci,
length = 0.02, angle = 90, code = 3, col = "grey") +
abline(h = 0, type = 2)
## integer(0)
axis(side = 1, at = 1:length(sig.vars.list), labels = axis_labels, las = 2, cex.axis = 0.8)
par(opar)
top.ols <- lm(train_labels ~ .,
data = scaled.train_data[,sig.vars.index])
top.ols.pred <- predict(object = top.ols, newdata = scaled.test_data[,sig.vars.index])
test_results(top.ols.pred)
value | ||
---|---|---|
cor | r | 0.734 |
R^2 | 0.538 | |
MAE | 3.817 | |
cor | Age.bias | -0.59 |
age_plot(top.ols.pred)
## fit model using optimal lambda value (1 SE value, not minimum)
set.seed(1883)
top.lasso.fit <- glmnet(x = x.train[,sig.vars.index], y = y.train,
alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
top.lasso.pred <- predict(top.lasso.fit, newx = as.matrix(scaled.test_data[,sig.vars.index]))
test_results(top.lasso.pred)
value | ||
---|---|---|
cor | r | 0.735 |
R^2 | 0.54 | |
MAE | 3.792 | |
cor | Age.bias | -0.677 |
age_plot(top.lasso.pred)
Using the same training/test split as above.
run_lasso <- function(list) {
set.seed(1883)
train_data <- healthy.df[index == 1, list]
test_data <- healthy.df[index == 2, list]
scaled.train_data <- as.data.frame(scale(train_data))
scaled.test_data <- as.data.frame(scale(test_data))
x.train <- as.matrix(scaled.train_data)
dimnames(x.train) <- NULL
y.train <- as.matrix(train_labels)
lasso.fit.cv <- cv.glmnet(x = x.train, y = y.train, alpha = 1, family = "gaussian") ## cross-validation for lambda
lasso.fit <- glmnet(x = x.train, y = y.train, alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
lasso.pred <- predict(lasso.fit, newx = as.matrix(scaled.test_data))
dimnames(lasso.pred)[[2]] <- deparse(substitute(list))
return(lasso.pred)
}
t1.vars.list <- grep("forced|nifti|discrepancy|t2_flair", grep("volume", names(df), value = T), invert = T, value = T)
t1.pred <- run_lasso(t1.vars.list)
test_results(t1.pred)
value | ||
---|---|---|
cor | r | 0.684 |
R^2 | 0.468 | |
MAE | 4.14 | |
cor | Age.bias | -0.721 |
Only one variable, just use OLS regression.
t2.ols <- lm(train_labels ~ total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0,
data = scaled.train_data)
t2.ols.pred <- predict(object = t2.ols, newdata = scaled.test_data)
test_results(t2.ols.pred)
value | ||
---|---|---|
cor | r | 0.308 |
R^2 | 0.095 | |
MAE | 5.653 | |
cor | Age.bias | -0.942 |
t2star.vars.list <- grep("forced|nifti|discrepancy", grep("t2star", names(df), value = T), invert = T, value = T)
t2star.pred <- run_lasso(t2star.vars.list)
test_results(t2star.pred)
value | ||
---|---|---|
cor | r | 0.329 |
R^2 | 0.108 | |
MAE | 5.78 | |
cor | Age.bias | -0.987 |
diffusion.vars.list <- grep("forced|nifti|discrepancy", grep("skeleton|tract", names(df), value = T), invert = T, value = T)
diffusion.pred <- run_lasso(diffusion.vars.list)
test_results(diffusion.pred)
value | ||
---|---|---|
cor | r | 0.73 |
R^2 | 0.533 | |
MAE | 3.897 | |
cor | Age.bias | -0.638 |
task.vars.list <- grep("forced|nifti|discrepancy", grep("bold|activation", names(df), value = T), invert = T, value = T)
task.pred <- run_lasso(task.vars.list)
test_results(task.pred)
value | ||
---|---|---|
cor | r | 0.161 |
R^2 | 0.026 | |
MAE | 5.929 | |
cor | Age.bias | -0.986 |
rest.vars.list <- grep("forced|nifti|discrepancy", grep("Partial_corr_25_dim", names(df), value = T), invert = T, value = T)
rest.pred <- run_lasso(rest.vars.list)
test_results(rest.pred)
value | ||
---|---|---|
cor | r | 0.44 |
R^2 | 0.194 | |
MAE | 5.261 | |
cor | Age.bias | -0.921 |
pred.mat <- cbind(test_labels, t1.pred, t2.ols.pred, t2star.pred, diffusion.pred, task.pred, rest.pred)
colnames(pred.mat) <- c("Age", "T1-weighted","T2-FLAIR","T2*","Diffusion","Task fMRI","Resting-state fMRI")
# col1 <- colorRampPalette(brewer.pal(n = 10, name = "RdBu"))
col1 <- colorRampPalette(colors = c("firebrick", "white", "dodgerblue"))
cairo_pdf(filename = "~/Work/Articles/Brain age/UK Biobank multi-modal brain age/variable_corrplot.pdf")
corrplot(cor(pred.mat), type = "upper", diag = T, method = "color", col = col1(100),
addCoef.col = "black", tl.col = "black")
dev.off()
## quartz_off_screen
## 2
corrplot(cor(pred.mat), type = "upper", diag = T, method = "color", col = col1(100),
addCoef.col = "black", tl.col = "black")
not.t1.list <- c("total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", t2star.vars.list, diffusion.vars.list, task.vars.list, rest.vars.list)
not.t1.pred <- run_lasso(not.t1.list)
test_results(not.t1.pred)
value | ||
---|---|---|
cor | r | 0.751 |
R^2 | 0.565 | |
MAE | 3.752 | |
cor | Age.bias | -0.654 |
not.t2.list <- c(t1.vars.list, t2star.vars.list, diffusion.vars.list, task.vars.list, rest.vars.list)
not.t2.pred <- run_lasso(not.t2.list)
test_results(not.t2.pred)
value | ||
---|---|---|
cor | r | 0.778 |
R^2 | 0.605 | |
MAE | 3.572 | |
cor | Age.bias | -0.642 |
not.t2star.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", diffusion.vars.list, task.vars.list, rest.vars.list)
not.t1.pred <- run_lasso(not.t2star.list)
test_results(not.t1.pred)
value | ||
---|---|---|
cor | r | 0.773 |
R^2 | 0.598 | |
MAE | 3.598 | |
cor | Age.bias | -0.641 |
not.diffusion.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", t2star.vars.list, task.vars.list, rest.vars.list)
not.diffusion.pred <- run_lasso(not.diffusion.list)
test_results(not.diffusion.pred)
value | ||
---|---|---|
cor | r | 0.715 |
R^2 | 0.511 | |
MAE | 3.975 | |
cor | Age.bias | -0.692 |
not.task.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", t2star.vars.list, diffusion.vars.list, rest.vars.list)
not.task.pred <- run_lasso(not.task.list)
test_results(not.task.pred)
value | ||
---|---|---|
cor | r | 0.781 |
R^2 | 0.61 | |
MAE | 3.569 | |
cor | Age.bias | -0.634 |
not.rest.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", t2star.vars.list, diffusion.vars.list, task.vars.list)
not.rest.pred <- run_lasso(not.rest.list)
test_results(not.rest.pred)
value | ||
---|---|---|
cor | r | 0.779 |
R^2 | 0.607 | |
MAE | 3.522 | |
cor | Age.bias | -0.615 |
Define non-healthy people as testing set.
non.healthy.df <- subset(df, df$healthy == "non-healthy")
table(complete.cases(non.healthy.df[,imaging.vars.list]))
##
## TRUE
## 14701
kable(describe(non.healthy.df$age_at_scan), digits = 2) %>% kable_styling(bootstrap_options = c("striped", "condensed"))
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 14701 | 62.64 | 7.45 | 63 | 62.78 | 8.9 | 45 | 80 | 35 | -0.16 | -0.88 | 0.06 |
describeBy(non.healthy.df$age_at_scan, non.healthy.df$sex)
##
## Descriptive statistics by group
## group: Female
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 7914 61.87 7.33 62 61.93 8.9 45 80 35 -0.07 -0.89
## se
## X1 0.08
## --------------------------------------------------------
## group: Male
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 6787 63.54 7.49 65 63.8 7.41 45 80 35 -0.29 -0.82
## se
## X1 0.09
Scale new subjects variables using the scaling parameters from the training set.
scaled.non.healthy.test <- as.data.frame(scale(non.healthy.df[,imaging.vars.list], scaling.parameters.center, scaling.parameters.scale))
lasso.non.healthy.pred <- as.numeric(predict(lasso.fit, newx = as.matrix(scaled.non.healthy.test)))
non.healthy.labels <- non.healthy.df$age_at_scan
test_results2 <- function(pred, labels) {
r <- cor.test(labels, pred)$estimate
r.sq <- summary(lm(labels ~ pred))$r.squared
MAE <- mean(abs(pred - labels), na.rm = T)
age.bias <- cor.test(labels, (pred - labels))$estimate
value <- sapply(c(r,r.sq, MAE, age.bias), function(x) round(x, 3))
results <- cbind(c("r", "R^2", "MAE", "Age.bias"), value)
return(kable(results) %>% kable_styling(bootstrap_options = c("striped","condensed", "responsive", full_width = F, position = "centre")))
}
test_results2(lasso.non.healthy.pred, non.healthy.labels)
value | ||
---|---|---|
cor | r | 0.803 |
R^2 | 0.644 | |
MAE | 3.555 | |
cor | Age.bias | -0.64 |
ggplot() +
geom_abline(slope = 1, intercept = 0) +
geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), shape = 16, col = "darkgoldenrod2", size = 1.5, alpha = 0.25) +
geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), method = "lm", col = "darkgrey") +
xlab("Age (years)") +
ylab("Brain-predicted age (years)") +
theme_cowplot()
Calculate age bias in initial test data.
bias.model <- lm(lasso.pred ~ test_labels)
bias.model$coefficients[1]
## (Intercept)
## 25.72573
bias.model$coefficients[2]
## test_labels
## 0.5797221
Apply correction to new (non-healthy) test data. Subtract the intercept and then divide by the slope
lasso.non.healthy.pred.corrected <- (lasso.non.healthy.pred - bias.model$coefficients[1]) / bias.model$coefficients[2]
test_results2(lasso.non.healthy.pred.corrected, non.healthy.labels)
value | ||
---|---|---|
cor | r | 0.803 |
R^2 | 0.644 | |
MAE | 4.631 | |
cor | Age.bias | 0.083 |
ggplot() +
geom_abline(slope = 1, intercept = 0) +
geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), shape = 16, col = "darkgoldenrod2", size = 1.5, alpha = 0.25) +
geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), method = "lm", col = "darkgrey") +
xlab("Age (years)") +
ylab("Brain-predicted age (years)") +
theme_cowplot()
Using cowplot package
plot1 <- ggplot() +
geom_abline(slope = 1, intercept = 0, lty = 2) +
geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), shape = 16, col = "darkgoldenrod2", size = 1.5, alpha = 0.25) +
geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), method = "lm", col = "darkgrey") +
xlab("Age (years)") +
ylab("Brain-predicted age (years)") +
theme_cowplot()
plot2 <- ggplot() +
geom_abline(slope = 1, intercept = 0, lty = 2) +
geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), shape = 16, col = "#0ebdee", size = 1.5, alpha = 0.25) +
geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), method = "lm", col = "darkgrey") +
xlab("Age (years)") +
ylab("Bias-adjusted age (years)") +
theme_cowplot()
plot_grid(plot1, plot2, ncol = 2, labels = "AUTO")
ggsave("~/Work/Articles/Brain age/UK Biobank multi-modal brain age/bias_correction_scatterplot.pdf", plot_grid(plot1, plot2, ncol = 2, labels = "AUTO"), useDingbats = FALSE, dpi = 100, height = 4, width = 8)
Define brain-PAD and look at descriptives
non.healthy.df$brainPAD <- lasso.non.healthy.pred.corrected - non.healthy.df$age_at_scan
kable(describe(non.healthy.df$brainPAD), digits = 2) %>% kable_styling(bootstrap_options = c("striped", "condensed"))
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 14701 | 0.58 | 5.92 | 0.33 | 0.44 | 5.66 | -24.26 | 38.27 | 62.53 | 0.35 | 0.87 | 0.05 |
hist(non.healthy.df$brainPAD, breaks = 25, col = "darkgoldenrod2", xlab = "Brain-PAD (years)")
Decide what to use as covariates for brain-PAD
non.healthy.df$head_motion_tfmri <- non.healthy.df$mean_tfmri_head_motion_averaged_across_space_and_time_points_f25742_2_0
m1 <- (lm(brainPAD ~ poly(age_at_scan, 2, raw = F) +
sex +
height_f12144_2_0 +
volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
head_motion_tfmri,
data = non.healthy.df))
summary(m1)
##
## Call:
## lm(formula = brainPAD ~ poly(age_at_scan, 2, raw = F) + sex +
## height_f12144_2_0 + volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
## head_motion_tfmri, data = non.healthy.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.102 -3.802 -0.206 3.577 35.068
##
## Coefficients:
## Estimate
## (Intercept) -3.654316
## poly(age_at_scan, 2, raw = F)1 17.099857
## poly(age_at_scan, 2, raw = F)2 59.098200
## sexMale 0.929932
## height_f12144_2_0 -0.016317
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 2.893481
## head_motion_tfmri 19.543429
## Std. Error
## (Intercept) 1.649942
## poly(age_at_scan, 2, raw = F)1 6.030984
## poly(age_at_scan, 2, raw = F)2 5.731581
## sexMale 0.148847
## height_f12144_2_0 0.007846
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.519682
## head_motion_tfmri 0.752032
## t value
## (Intercept) -2.215
## poly(age_at_scan, 2, raw = F)1 2.835
## poly(age_at_scan, 2, raw = F)2 10.311
## sexMale 6.248
## height_f12144_2_0 -2.080
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 5.568
## head_motion_tfmri 25.987
## Pr(>|t|)
## (Intercept) 0.02679
## poly(age_at_scan, 2, raw = F)1 0.00458
## poly(age_at_scan, 2, raw = F)2 < 2e-16
## sexMale 4.28e-10
## height_f12144_2_0 0.03757
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 2.63e-08
## head_motion_tfmri < 2e-16
##
## (Intercept) *
## poly(age_at_scan, 2, raw = F)1 **
## poly(age_at_scan, 2, raw = F)2 ***
## sexMale ***
## height_f12144_2_0 *
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 ***
## head_motion_tfmri ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.724 on 14694 degrees of freedom
## Multiple R-squared: 0.0654, Adjusted R-squared: 0.06502
## F-statistic: 171.4 on 6 and 14694 DF, p-value: < 2.2e-16
lm.beta(m1)
##
## Call:
## lm(formula = brainPAD ~ poly(age_at_scan, 2, raw = F) + sex +
## height_f12144_2_0 + volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
## head_motion_tfmri, data = non.healthy.df)
##
## Standardized Coefficients::
## (Intercept)
## 0.00000000
## poly(age_at_scan, 2, raw = F)1
## 0.02382672
## poly(age_at_scan, 2, raw = F)2
## 0.08234668
## sexMale
## 0.07832250
## height_f12144_2_0
## -0.02577438
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0
## 0.05963084
## head_motion_tfmri
## 0.21564003
etasq(m1, partial = T)
## Partial eta^2
## poly(age_at_scan, 2, raw = F) 0.0076814828
## sex 0.0026492783
## height_f12144_2_0 0.0002942559
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.0021052835
## head_motion_tfmri 0.0439413342
## Residuals NA
Used hier.part package to define unique (i.e., independent) variance and joint (i.e., shared) variance in brain-PAD.
hp.res <- hier.part(non.healthy.df$brainPAD, non.healthy.df[c("age_at_scan","sex", "height_f12144_2_0", "volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0", "head_motion_tfmri")], gof = "Rsqu", barplot = F)
round(hp.res$gfs,3)
## [1] 0.000 0.007 0.002 0.000 0.001 0.054 0.008 0.007 0.008 0.055 0.006
## [12] 0.008 0.055 0.001 0.054 0.055 0.011 0.014 0.056 0.008 0.055 0.056
## [23] 0.010 0.056 0.058 0.055 0.015 0.057 0.058 0.056 0.058 0.059
round(hp.res$IJ,3)
## I
## age_at_scan 0.004
## sex 0.003
## height_f12144_2_0 0.001
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.002
## head_motion_tfmri 0.049
## J
## age_at_scan 0.003
## sex -0.001
## height_f12144_2_0 -0.001
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 -0.001
## head_motion_tfmri 0.005
## Total
## age_at_scan 0.007
## sex 0.002
## height_f12144_2_0 0.000
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.001
## head_motion_tfmri 0.054
round(hp.res$I.perc,3)
## I
## age_at_scan 6.012
## sex 5.486
## height_f12144_2_0 1.085
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 3.260
## head_motion_tfmri 84.156
P-values are corrected using FDR for 18 different comparisons.
run_lm <- function(var, data) {
m1 <- lm(brainPAD ~ data[[var]] +
poly(age_at_scan, 2, raw = F) +
sex +
height_f12144_2_0 +
volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
head_motion_tfmri,
data = data)
y <- round(summary(m1)$coefficients[2,],3)
z <- round(p.adjust(summary(m1)$coefficients[2,4], method = "fdr", n = 18),5)
names(z) <- "corrected_p"
part.eta <- round(etasq(m1, partial = T)[1,1],4)
names(part.eta) <- "partial eta^2"
return(c(y,z,part.eta))
}
run_lm("diastolic_blood_pressure_automated_reading_f4079_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0490 0.0050 9.5200 0.0000 0.0000
## partial eta^2
## 0.0074
run_lm("systolic_blood_pressure_automated_reading_f4080_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0280 0.0030 9.4330 0.0000 0.0000
## partial eta^2
## 0.0073
run_lm("body_mass_index_bmi_f21001_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.008 0.013 0.646 0.518 1.000
## partial eta^2
## 0.000
run_lm("weight_f21002_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0060 0.0050 1.4000 0.1620 1.0000
## partial eta^2
## 0.0001
run_lm("hip_circumference_f49_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.0080 0.0060 -1.3810 0.1670 1.0000
## partial eta^2
## 0.0001
Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
with(subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3), table(diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0))
## diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0
## -1 -3 0 1
## 0 0 13747 836
run_lm("diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0", subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 2.1150 0.2070 10.2080 0.0000 0.0000
## partial eta^2
## 0.0071
TukeyHSD(aov(brainPAD ~ diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, data = subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, data = subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3))
##
## $diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0
## diff lwr upr p adj
## 1-0 3.200375 2.790681 3.610068 0
Question What was your age when the stroke was first diagnosed? recoded to be presence/absence of stroke history. Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
run_lm("stroke_history", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 2.695 0.407 6.614 0.000 0.000
## partial eta^2
## 0.003
TukeyHSD(aov(brainPAD ~ stroke_history, data = non.healthy.df))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ stroke_history, data = non.healthy.df)
##
## $stroke_history
## diff lwr upr p adj
## stroke-no stroke 3.426915 2.604718 4.249112 0
Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
with(subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 > 0), table(facial_ageinguses_datacoding_100435_f1757_2_0))
## facial_ageinguses_datacoding_100435_f1757_2_0
## 1 2 3
## 10577 157 2668
run_lm("facial_ageinguses_datacoding_100435_f1757_2_0", subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 > 0))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.5550 0.4620 -1.2010 0.2300 1.0000
## partial eta^2
## 0.0001
with(subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 != -1 & non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 != -3), describeBy(brainPAD, as.factor(facial_ageinguses_datacoding_100435_f1757_2_0)))
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## X1 1 10577 0.51 5.91 0.26 0.36 5.64 -24.26 34.18 58.44 0.34
## kurtosis se
## X1 0.79 0.06
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew
## X1 1 157 0.23 6.15 0.19 0.3 6.18 -17.52 19.18 36.7 -0.04
## kurtosis se
## X1 0.33 0.49
## --------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew
## X1 1 2668 0.58 5.89 0.39 0.45 5.67 -16.43 38.27 54.7 0.38
## kurtosis se
## X1 1.09 0.11
Prefer not to answer coded as -3. These values need to be excluded.
table(non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0)
##
## -3 0 1 2
## 49 9006 5009 558
with(subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3), table(smoking_statususes_datacoding_90_f20116_2_0))
## smoking_statususes_datacoding_90_f20116_2_0
## 0 1 2
## 9006 5009 558
run_lm("smoking_statususes_datacoding_90_f20116_2_0", subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.879 0.102 8.636 0.000 0.000
## partial eta^2
## 0.007
TukeyHSD(aov(brainPAD ~ smoking_statususes_datacoding_90_f20116_2_0, data = subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ smoking_statususes_datacoding_90_f20116_2_0, data = subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3))
##
## $smoking_statususes_datacoding_90_f20116_2_0
## diff lwr upr p adj
## 1-0 1.2244273 0.9815577 1.467297 0.0000000
## 2-0 2.0732981 1.4721869 2.674409 0.0000000
## 2-1 0.8488708 0.2339260 1.463816 0.0034851
with(subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3), describeBy(brainPAD, smoking_statususes_datacoding_90_f20116_2_0))
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew
## X1 1 9006 0.07 5.78 -0.15 -0.06 5.58 -24.26 38.27 62.53 0.34
## kurtosis se
## X1 0.9 0.06
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew
## X1 1 5009 1.29 6.03 1.02 1.15 5.78 -23.62 34.18 57.8 0.33
## kurtosis se
## X1 0.84 0.09
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 558 2.14 6.14 1.76 1.92 5.81 -14.38 28.33 42.71 0.47 0.8
## se
## X1 0.26
Prefer not to answer coded as -3. These values need to be excluded.
run_lm("alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0", subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.997 0.147 -6.776 0.000 0.000
## partial eta^2
## 0.009
TukeyHSD(aov(brainPAD ~ alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0, data = subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = brainPAD ~ alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0, data = subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3))
##
## $alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0
## diff lwr upr p adj
## 2-1 -1.1541854 -1.58676824 -0.721602484 0.0000000
## 3-1 -1.5252477 -1.96057436 -1.089920992 0.0000000
## 4-1 -1.7463517 -2.27707136 -1.215632107 0.0000000
## 5-1 -1.2546140 -1.80184014 -0.707387820 0.0000000
## 6-1 -0.9950198 -1.64425031 -0.345789271 0.0001832
## 3-2 -0.3710623 -0.74594379 0.003819168 0.0542344
## 4-2 -0.5921664 -1.07454602 -0.109786722 0.0062410
## 5-2 -0.1004286 -0.60091166 0.400054423 0.9928403
## 6-2 0.1591656 -0.45118438 0.769515530 0.9764937
## 4-3 -0.2211041 -0.70594578 0.263737669 0.7853564
## 5-3 0.2706337 -0.23222279 0.773490184 0.6422536
## 6-3 0.5302279 -0.08206979 1.142525562 0.1337037
## 5-4 0.4917378 -0.09564513 1.079120632 0.1611072
## 6-4 0.7513319 0.06791227 1.434751611 0.0214300
## 6-5 0.2595942 -0.43672154 0.955909919 0.8961925
with(subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3), describeBy(brainPAD, alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0))
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2388 1.71 6.16 1.34 1.48 5.79 -21.19 29.8 51 0.41 0.66
## se
## X1 0.13
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 4081 0.55 5.72 0.34 0.46 5.4 -23.62 28.72 52.33 0.25 0.7
## se
## X1 0.09
## --------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew
## X1 1 3945 0.18 5.96 0.03 0.06 5.77 -24.26 38.27 62.53 0.43
## kurtosis se
## X1 1.46 0.09
## --------------------------------------------------------
## group: 4
## vars n mean sd median trimmed mad min max range skew
## X1 1 1723 -0.04 5.62 -0.3 -0.19 5.53 -16.45 22.26 38.71 0.3
## kurtosis se
## X1 0.33 0.14
## --------------------------------------------------------
## group: 5
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1554 0.45 5.99 0.23 0.3 5.6 -20.62 25.71 46.32 0.33 0.77
## se
## X1 0.15
## --------------------------------------------------------
## group: 6
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 929 0.71 5.95 0.44 0.6 5.77 -15.34 24.01 39.36 0.24 0.31
## se
## X1 0.2
Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.
run_lm("duration_of_moderate_activityuses_datacoding_100291_f894_2_0", subset(non.healthy.df, non.healthy.df$duration_of_moderate_activityuses_datacoding_100291_f894_2_0 != -3 & non.healthy.df$duration_of_moderate_activityuses_datacoding_100291_f894_2_0 != -1))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.000 0.001 0.279 0.780 1.000
## partial eta^2
## 0.000
run_lm("duration_of_vigorous_activityuses_datacoding_100291_f914_2_0", subset(non.healthy.df, non.healthy.df$duration_of_vigorous_activityuses_datacoding_100291_f914_2_0 != -3 & non.healthy.df$duration_of_vigorous_activityuses_datacoding_100291_f914_2_0 != -1))
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.001 0.001 -0.461 0.645 1.000
## partial eta^2
## 0.000
Tests including Fluid intelligence, Trail-making task, Matrix pattern completion, Tower rearranging.
run_lm("fluid_intelligence_score_f20016_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.1470 0.0240 -5.9980 0.0000 0.0000
## partial eta^2
## 0.0027
## Trail making
run_lm("duration_to_complete_numeric_path_trail_1uses_datacoding_1990_f6348_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.00300 0.00100 2.71200 0.00700 0.12066
## partial eta^2
## 0.00120
run_lm("duration_to_complete_alphanumeric_path_trail_2uses_datacoding_1990_f6350_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0020 0.0000 5.6670 0.0000 0.0000
## partial eta^2
## 0.0054
## Matrix pattern completion
run_lm("number_of_puzzles_correctly_solved_f6373_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.2180 0.0370 -5.8820 0.0000 0.0000
## partial eta^2
## 0.0059
run_lm("duration_spent_answering_each_puzzle_f6333_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## 0.0100 0.0070 1.4520 0.1470 1.0000
## partial eta^2
## 0.0004
## Tower rearranging
run_lm("number_of_puzzles_correct_f6382_2_0", non.healthy.df)
## Estimate Std. Error t value Pr(>|t|) corrected_p
## -0.117 0.021 -5.468 0.000 0.000
## partial eta^2
## 0.005