Analysis of the UK Biobank neuroimaging data to build a model of brain ageing using multiple neuroimaging modalities. James Cole, 2nd October 2019. The analysis was updated to reflect further analysis in response to reviewers in March 2020.

Set up working directory, data and libraries

The data were initially read using Ken Hanscombe’s ukbtools package (ukb_df tool), which reads the .tab file, .r script and .html. This is slow process for large files, hence here the data are read and than saved as .rda files, which can be loaded more swiftly for convenience.

# clear workspace
rm(list = ls())
# Load packages
library(boot)
library(corrplot)
library(cowplot)
library(dplyr)
library(ggplot2)
library(glmnet)
library(heplots)
library(hier.part)
library(Hmisc)
library(kableExtra)
library(knitr)
library(lm.beta)
library(psych)
library(pwr)
library(tidyverse)
library(ukbtools)

Load downloaded data files, and merge

Looks for existing file and loads is available.

if (file.exists("ukb_data.rda")) {
  cat("Loading data file", date(),sep = " ")
  load(file = "ukb_data.rda")
  } else {
    tmp <- ukb_df("ukb23892", path = "/Volumes/home/analysis/UKBiobank/ID_23892")
    tmp1 <- ukb_df("ukb26571", path = "/Volumes/home/analysis/UKBiobank/ID_26571")
    tmp2 <- read.table("rs_fMRI_data.txt", colClasses = c("eid" = "character"))
    df <- list(tmp, tmp1, tmp2) %>% reduce(left_join, by = "eid") # using purr
    save(df, file = "ukb_data.rda")
    rm(list = ls(pattern = "tmp*"))
  }
## Loading data file Wed Mar  4 15:30:18 2020

Data from n = 22392 UK Biobank participants were downloaded for the study.

Subset to include those with scans only

Field 12188 uses coding 0=no, 1=yes, -1=unknown. Subset to only include those who completed brain MRI from imaging assessment.

df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0 <- as.factor(df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0)
table(df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0)
## 
##    -1     0     1 
##    49  2025 20237
df <- subset(df, df$brain_mri_measurement_completeduses_datacoding_21_f12188_2_0 == 1)

Reformat age (numeric) and sex (factor) variables

df[,grep("age_when_attended", names(df))] <- lapply(df[,grep("age_when_attended", names(df))], as.numeric)
df$sex <- factor(df$sexuses_datacoding_9_f31_0_0)
df$sex <- dplyr::recode(df$sex, "0" = "Female", "1" = "Male")
df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 <-  factor(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0)
df$stroke_history <- factor(!is.na(df$age_stroke_diagnoseduses_datacoding_100291_f4056_2_0) & as.numeric(df$age_stroke_diagnoseduses_datacoding_100291_f4056_2_0) > 0, labels = c("no stroke", "stroke"))

var.list <- c("body_mass_index_bmi_f21001_2_0",
              "diastolic_blood_pressure_automated_reading_f4079_2_0",
              "diastolic_blood_pressure_automated_reading_f4079_2_1",
              "systolic_blood_pressure_automated_reading_f4080_2_0",
              "systolic_blood_pressure_automated_reading_f4080_2_1",
              "hip_circumference_f49_2_0",
              "weight_f21002_2_0",
              "hand_grip_strength_left_f46_2_0",
              "hand_grip_strength_right_f47_2_0",
              "overall_health_ratinguses_datacoding_100508_f2178_2_0",
              "longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0",
              "height_f12144_2_0",
              "mean_tfmri_head_motion_averaged_across_space_and_time_points_f25742_2_0",
              "mean_rfmri_head_motion_averaged_across_space_and_time_points_f25741_2_0",
              "volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0",
              "fluid_intelligence_score_f20016_2_0",
              "duration_to_complete_numeric_path_trail_1uses_datacoding_1990_f6348_2_0",
              "duration_to_complete_alphanumeric_path_trail_2uses_datacoding_1990_f6350_2_0",
              "number_of_puzzles_correctly_solved_f6373_2_0",
              "duration_spent_answering_each_puzzle_f6333_2_0",
              "number_of_puzzles_correct_f6382_2_0",
              "duration_of_moderate_activityuses_datacoding_100291_f894_2_0",
              "duration_of_vigorous_activityuses_datacoding_100291_f914_2_0")

df[,var.list] <- lapply(df[,var.list], as.numeric)

Make list of imaging variables and recode variables to numeric

imaging.vars.list <- grep("forced|nifti|discrepancy", grep("volume|t2star|t2_flair|bold|activation|skeleton|tract|Partial_corr_25_dim", names(df), value = T), invert = T, value = T)
df[,grep("volume", names(df))] <- lapply(df[,grep("volume", names(df))], as.numeric)
df[,grep("t2star", names(df))] <- lapply(df[,grep("t2star", names(df))], as.numeric)
df[,grep("t2_flair", names(df))] <- lapply(df[,grep("t2_flair", names(df))], as.numeric)
## Warning in lapply(df[, grep("t2_flair", names(df))], as.numeric): NAs
## introduced by coercion
df[,grep("bold", names(df))] <- lapply(df[,grep("bold", names(df))], as.numeric)
df[,grep("skeleton", names(df))] <- lapply(df[,grep("skeleton", names(df))], as.numeric)
df[,grep("tract", names(df))] <- lapply(df[,grep("tract", names(df))], as.numeric)
df[,grep("activation", names(df))] <- lapply(df[,grep("activation", names(df))], as.numeric)

The UK Biobank imaging visit is instance 2

Instance 0 is the baseline demographic clinical visit from 2006-2010. Instance 1 is a follow-up from 2012, no imaging then. Exclude people without complete imaging data.

table(complete.cases(df[,c(imaging.vars.list)]))
## 
## FALSE  TRUE 
##  2776 17461
df <- df[complete.cases(df[,c(imaging.vars.list)]),]
df$age_at_scan <- df$age_when_attended_assessment_centre_f21003_2_0
describe(df$age_at_scan)
##    vars     n  mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 17461 62.46 7.42     63   62.57 8.9  45  80    35 -0.14    -0.89
##      se
## X1 0.06
table(df$sex)
## 
## Female   Male 
##   9274   8187
ggplot(df, aes(age_at_scan)) +
  geom_histogram(binwidth = 1, fill = "darkgoldenrod2", colour = "black", lwd = 0.25) +
  xlab('Age at scan (years)') +
  theme_cowplot()

Analysis

Set up data

Select only healthy participants

Function to generate a data.frame with TRUE or FALSE for the presence/absence of ICD diagnosis. Adapted from Ken’s ukb_icd_diagnosis() function.

has_icd.diagnosis <- function(id, icd.version){
  x <- df %>% dplyr::filter(eid %in% id) %>% 
  dplyr::select(matches(paste("^diagnoses.*icd", icd.version, sep = ""))) %>% 
  dplyr::select_if(colSums(!is.na(.)) > 0) %>% ncol() != 0
  return(data.frame(id, x))
}

Compute ICD diagnosis data

Running on the whole dataset takes >4 hours for n>20,000. So load .RDA file if avaialble.

if (file.exists("ukb_icd.diagnosis_data.rda")) {
  load("ukb_icd.diagnosis_data.rda")
  } else {
    has_icd.diagnosis.df <- do.call(rbind, lapply(df$eid, function(x) has_icd.diagnosis(x, 10)))
    save(has_icd.diagnosis.df, file = "ukb_icd.diagnosis_data.rda")
  }

Merge to add health status variable.

names(has_icd.diagnosis.df) <- c("eid", "icd_positive")
df <- merge(df, has_icd.diagnosis.df)
table(df$icd_positive)
## 
## FALSE  TRUE 
##  4708 12753

Check number of ICD-positive against subjective health and longstanding illness.

table(df$overall_health_ratinguses_datacoding_100508_f2178_2_0, df$icd_positive)
##     
##      FALSE TRUE
##   -3     0    3
##   -1     3   16
##   1   1180 2141
##   2   3018 7963
##   3    436 2244
##   4     36  307
table(df$longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0, df$icd_positive)
##     
##      FALSE TRUE
##   -3     1   14
##   -1    40  196
##   0   3899 8723
##   1    733 3741

Make data.frame of healthy people only

Include people with no ICD-10 diagnosis, no self-reported long-standing illness disability or infirmity (F2188) and good or excellent self-reported health (F2178). Remove subjects with NAs in the age column and with the missing imaging variables.

df$healthy <- ifelse(df$icd_positive == "FALSE" & df$longstanding_illness_disability_or_infirmityuses_datacoding_100349_f2188_2_0 == 0 & df$overall_health_ratinguses_datacoding_100508_f2178_2_0 >= 2 & df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 == 0 & df$stroke_history != "stroke", "healthy", "non-healthy")
healthy.df <- subset(df, df$healthy == "healthy")
table(is.na(df$healthy))
## 
## FALSE  TRUE 
## 17426    35

Descriptive statistics of healthy vs. non-healthy

Run descriptive statistics

describeBy(df$age_at_scan, df$healthy)
## 
##  Descriptive statistics by group 
## group: healthy
##    vars    n  mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 2725 61.47 7.21     62   61.48 8.9  46  79    33 -0.02     -0.9
##      se
## X1 0.14
## -------------------------------------------------------- 
## group: non-healthy
##    vars     n  mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 14701 62.64 7.45     63   62.78 8.9  45  80    35 -0.16    -0.88
##      se
## X1 0.06
table(df$sex, df$healthy)
##         
##          healthy non-healthy
##   Female    1343        7914
##   Male      1382        6787
round(prop.table(table(df$sex, df$healthy), 2),3)
##         
##          healthy non-healthy
##   Female   0.493       0.538
##   Male     0.507       0.462
ggplot(subset(df, !is.na(df$healthy)), aes(age_at_scan, color = healthy)) +
  geom_density(aes(fill = healthy), alpha = 0.5) +
  xlab("Age at MRI scan (years)") +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  theme_cowplot() 

Ethnic background

Lots of missing data here, around 50% NAs Coding here: http://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001

table(is.na(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0))
## 
## FALSE  TRUE 
##  8563  8898
table(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0, df$healthy)
##       
##        healthy non-healthy
##   -1         1           1
##   -3         3           9
##   1          1           1
##   1001    1236        6694
##   1002      27         188
##   1003      33         144
##   2          0           1
##   2001       2          11
##   2002       1           4
##   2003       1           7
##   2004       5          12
##   3001      11          46
##   3002       1          13
##   3003       1           0
##   3004       2          11
##   4001       1          23
##   4002       2          16
##   5          4          17
##   6          5          28
round(100*prop.table(table(df$ethnic_backgrounduses_datacoding_1001_f21000_2_0, df$healthy), 2),2)
##       
##        healthy non-healthy
##   -1      0.07        0.01
##   -3      0.22        0.12
##   1       0.07        0.01
##   1001   92.45       92.64
##   1002    2.02        2.60
##   1003    2.47        1.99
##   2       0.00        0.01
##   2001    0.15        0.15
##   2002    0.07        0.06
##   2003    0.07        0.10
##   2004    0.37        0.17
##   3001    0.82        0.64
##   3002    0.07        0.18
##   3003    0.07        0.00
##   3004    0.15        0.15
##   4001    0.07        0.32
##   4002    0.15        0.22
##   5       0.30        0.24
##   6       0.37        0.39

Anthropometrics

BMI

by(df$body_mass_index_bmi_f21001_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
##   vars    n  mean   sd median trimmed  mad  min   max range skew kurtosis
## 1    1 2663 26.44 4.16  25.94   26.12 3.74 16.4 58.04 41.65 1.05     2.77
##     se Q0.25 Q0.75
## 1 0.08 23.58 28.68
## -------------------------------------------------------- 
## df$healthy: non-healthy
##   vars     n mean   sd median trimmed  mad   min   max range skew kurtosis
## 1    1 14348 26.6 4.41  25.94   26.22 3.87 13.39 55.23 41.84 1.05     2.18
##     se Q0.25 Q0.75
## 1 0.04 23.56 28.88

Weight

by(df$weight_f21002_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
##   vars    n mean   sd median trimmed   mad  min   max range skew kurtosis
## 1    1 2665 76.3 14.8     75   75.57 14.53 41.6 179.8 138.2 0.73     1.81
##     se Q0.25 Q0.75
## 1 0.29  65.6  85.4
## -------------------------------------------------------- 
## df$healthy: non-healthy
##   vars     n  mean    sd median trimmed   mad min   max range skew
## 1    1 14382 76.14 15.13   74.6   75.15 14.68  33 169.2 136.2 0.79
##   kurtosis   se Q0.25 Q0.75
## 1     1.27 0.13  65.2    85

Hip circumference

by(df$hip_circumference_f49_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
##   vars    n   mean   sd median trimmed  mad min max range skew kurtosis
## 1    1 2667 100.98 8.19    100  100.49 7.41  80 152    72 0.83     1.92
##     se Q0.25 Q0.75
## 1 0.16  95.5   105
## -------------------------------------------------------- 
## df$healthy: non-healthy
##   vars     n   mean   sd median trimmed  mad min max range skew kurtosis
## 1    1 14400 101.39 8.75    100  100.77 7.41  68 156    88 0.97     2.36
##     se Q0.25 Q0.75
## 1 0.07    96   106

Blood pressure

print("Diastolic")
## [1] "Diastolic"
by(df$diastolic_blood_pressure_automated_reading_f4079_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
##   vars    n  mean    sd median trimmed   mad min max range skew kurtosis
## 1    1 2254 79.67 10.61     79   79.52 10.38  43 114    71 0.14    -0.03
##     se Q0.25 Q0.75
## 1 0.22    72    87
## -------------------------------------------------------- 
## df$healthy: non-healthy
##   vars     n  mean    sd median trimmed   mad min max range skew kurtosis
## 1    1 12097 78.35 10.53     78    78.1 10.38  38 128    90 0.27     0.17
##    se Q0.25 Q0.75
## 1 0.1    71    85
print("Systolic")
## [1] "Systolic"
by(df$systolic_blood_pressure_automated_reading_f4080_2_0, df$healthy, function(x) describe(x, quant = c(.25,.75), na.rm = T))
## df$healthy: healthy
##   vars    n   mean    sd median trimmed   mad min max range skew kurtosis
## 1    1 2254 139.55 18.95    139  138.86 19.27  81 212   131 0.37     0.06
##    se Q0.25  Q0.75
## 1 0.4   126 151.75
## -------------------------------------------------------- 
## df$healthy: non-healthy
##   vars     n   mean    sd median trimmed   mad min max range skew kurtosis
## 1    1 12097 138.43 18.94    137   137.7 19.27  78 228   150 0.43     0.38
##     se Q0.25 Q0.75
## 1 0.17   125   150

Diabetes

table(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, df$healthy)
##     
##      healthy non-healthy
##   -1       0          24
##   -3       0          15
##   0     2725       13747
##   1        0         836
round(100*prop.table(table(df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, df$healthy), 2),2)
##     
##      healthy non-healthy
##   -1    0.00        0.16
##   -3    0.00        0.10
##   0   100.00       94.02
##   1     0.00        5.72

Stroke

table(df$stroke_history, df$healthy)
##            
##             healthy non-healthy
##   no stroke    2725       14500
##   stroke          0         201
round(100*prop.table(table(df$stroke_history, df$healthy), 2),2)
##            
##             healthy non-healthy
##   no stroke  100.00       98.63
##   stroke       0.00        1.37

Designate subset as training and validation/model testing

Use 80% for training and 20% for validation/testing.

# Determine sample size and assign training/test group variables
set.seed(1982)
index <- sample(2, nrow(healthy.df), replace = TRUE, prob = c(0.8, 0.2))
table(index)
## index
##    1    2 
## 2205  520
# Split data into training/testing and keep imaging variables only
train_data <- healthy.df[index == 1, imaging.vars.list]
test_data <- healthy.df[index == 2,  imaging.vars.list]
# Define objects with age labels for training and test sets
train_labels <- healthy.df[index == 1, "age_at_scan"]
test_labels <- healthy.df[index == 2, "age_at_scan"]

Check the age distribution between training and test

describe(train_labels)
##    vars    n  mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 2205 61.54 7.24     62   61.54 8.9  46  79    33 -0.03    -0.89
##      se
## X1 0.15
describe(test_labels)
##    vars   n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 520 61.2 7.07     61    61.2 8.9  47  78    31    0    -0.97 0.31
par(mfrow = c(1,2))
hist(train_labels, breaks = 20, col = "darkgoldenrod2", main = "Training set age", xlab = "Age at scan (years)")
hist(test_labels, breaks = 20, col = "darkgoldenrod2", main = "Validation set age", xlab = "Age at scan (years)")

par(mfrow = c(1,1))

Scale variables.

This is essential for ANN and probably a good idea for all the models. #### Important, need to apply scaling parameters to new data

scaled.train_data <- scale(train_data, scale = TRUE, center = TRUE)
scaling.parameters.center <- attr(scaled.train_data, "scaled:center")
scaling.parameters.scale <- attr(scaled.train_data, "scaled:scale")
scaled.test_data <- as.data.frame(scale(test_data, scaling.parameters.center, scaling.parameters.scale))
scaled.train_data <- as.data.frame(scaled.train_data)

Univariate correlations with age

Plot of n=1079 variables using the index order (which is basically arbitrary). With n = 2725, very small r values will be significant at 0.05. Bonferroni adjusted pvalues need to be below 0.05/1079 = 4.633920310^{-5}.

alpha <- 0.05
power <- pwr.r.test(n = length(healthy.df$age_at_scan), sig.level = alpha, power = 0.8, alternative = "greater")
power.bonf <- pwr.r.test(n = length(healthy.df$age_at_scan), sig.level = alpha/length(imaging.vars.list), power = 0.8, alternative = "greater")
plot(sapply(healthy.df[imaging.vars.list], function(x) cor(x, healthy.df$age_at_scan)), 
     type = "h", 
     col = "darkgoldenrod2",
     ylab = "Pearson's r with age");abline(h = 0);abline(h = power$r, lty = "dashed", col = "grey40");abline(h = (0 - power$r), lty = "dashed", col = "grey40"); abline(h = power.bonf$r, col = "darkred", lty = 2); abline(h = (0 - power.bonf$r), col = "darkred", lty = 2)
text(x = 0, y = 0.065, "Uncorrected p = 0.05", col = "grey40", adj = 0, cex = 0.8)
text(x = 0, y = 0.13, "Bonferroni p = 0.05", col = "darkred", adj = 0, cex = 0.8)

Of the univariate correlations with age, 886 are significant at p = 0.05. When using Bonferroni correction 704 are significant at p = 0.05.

Functions to output accuracy metrics and plot age by predicted age.

Take predicted age values as input.

test_results <- function(pred) {
  r <- cor.test(test_labels, pred)$estimate
  r.sq <- summary(lm(test_labels ~ pred))$r.squared
  MAE <- mean(abs(pred - test_labels), na.rm = T)
  age.bias <- cor.test(test_labels, (pred - test_labels))$estimate
  value <- sapply(c(r,r.sq, MAE, age.bias), function(x) round(x, 3))
  results <- cbind(c("r", "R^2", "MAE", "Age.bias"), value)
  return(kable(results) %>% kable_styling(bootstrap_options = c("striped","condensed", "responsive", full_width = F, position = "centre")))
}

age_plot <- function(pred) {
  ggplot() +
    geom_abline(slope = 1, intercept = 0) +
    geom_point(aes(x = test_labels, y = pred), shape = 21, bg = "darkgoldenrod2", size = 2) +
    geom_smooth(aes(x = test_labels, y = pred), method = "lm", col = "darkgrey") +
    labs(title = deparse(substitute(pred)), x = "Age (years)", y = "Brain-predicted age (years)") +
    theme_cowplot()
  }

LASSO regression

Using the glmnet package. Alpha = 1 is for LASSO penalisation (0 = ridge, 0.5 = elastic net).

x.train <- as.matrix(scaled.train_data)
dimnames(x.train) <- NULL
y.train <- as.matrix(train_labels)
## cross-validation for lambda
set.seed(1883)
lasso.fit.cv <- cv.glmnet(x = x.train, y = y.train,
                          alpha = 1, family = "gaussian")

Plot results. The minimum lambda value is 0.05, while the optimal lambda value (i.e., the highest value within 1 standard error of the minimum) is 0.127.

plot(lasso.fit.cv)

LASSO model performance on validation data

Fit model using previously optimised (through CV) lambda value (1 SE value, not minimum).

lasso.fit <- glmnet(x = x.train, y = y.train,
                    alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
lasso.pred <- predict(lasso.fit, newx = as.matrix(scaled.test_data))
test_results(lasso.pred)
value
cor r 0.78
R^2 0.609
MAE 3.549
cor Age.bias -0.671
age_plot(lasso.pred)

ggsave("~/Work/Articles/Brain age/UK Biobank multi-modal brain age/brain_age_scatterplot.pdf", useDingbats = FALSE, dpi = 75, height = 4, width = 4)

Variable weightings and feature selection results

LASSO.coefficient <- coef(lasso.fit, s = lasso.fit.cv$lambda.1se)[-1]
var.coefs <- data.frame(imaging.vars.list, LASSO.coefficient)
non.zero_vars <- subset(var.coefs, var.coefs$LASSO.coefficient != 0)
non.zero_vars$imaging.vars.list <- factor(non.zero_vars$imaging.vars.list)

Out of the original 1079 variables, the LASSO regression set 188 to non-zero, thus 891 variables were removed.

Bootstrap LASSO

Bootstrap 95% confidence intervals. Uses the boot package.

Function to obtain LASSO regression coefficients

Essential to convert coefficients to vector that stores zeros.

lasso.coef <- function(data, indices) {
  d <- data[indices,]
  fit <- glmnet(x = d[,-1], y = d[,1],
                    alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
  return(coef(fit)[,1])
}

Run bootstrap with n replications

Normal printing and plotting of results doesn’t work for high-dimensional datasets. Load data file if it already exists.

if (file.exists("lasso.boot.out.rda")) {
  load("lasso.boot.out.rda")
  cat("loading existing bootstrap file")
  } else {
    cat("running bootstraps")
    boot.out <- boot(data = cbind(y.train, x.train), statistic = lasso.coef, R = 1000)
    save(boot.out, file = "lasso.boot.out.rda")
  }
## loading existing bootstrap file

There were 221 non-zero coefficients.

Check histogram of bootstrap coefficients for top variable by way of example.

ggplot() +
  geom_histogram(bins = 100, aes(boot.out$t[,which.max(abs(boot.out$t0[-1])) + 1]),
                 fill = "darkgoldenrod2",
                 colour = "black",
                 lwd = 0.25) +
  xlab("Top variable bootstrapped coefficients") +
  theme_cowplot()

Function for getting CIs from vector

ci.vector <- function(index, boot.object, ci.type) {
  x <- boot.ci(boot.object, type = ci.type, index = index)
  return(x[4])
}

Use my ci.vector() function (defined above) to derive confidence intervals.

n <- length(boot.out$t0)
boot.ci.out <- sapply(1:n, ci.vector, boot.object = boot.out, ci.type = "basic")
x <- boot.out$t0[1:n]
y <- data.frame(t(matrix(unlist(boot.ci.out), ncol = n)))[4:5]
ci.df <- cbind(x, y)
names(ci.df) <- c("coef", "l.ci", "u.ci")

Identify variables with confidence intervals that do not overlap zero.

# drop intercept from plot using [-1] in vector ci.df$l.ci and ci.df$u.ci (i.e., the intercept is the top row)
sig.vars.index <- which(ci.df$l.ci[-1] > 0 | ci.df$u.ci[-1] < 0)
sig.vars.list <- imaging.vars.list[sig.vars.index]
sig.vars.df <- ci.df[sig.vars.index + 1,] ## add 1 to omit intercept row
sig.vars.df <- cbind(sig.vars.list, round(sig.vars.df,3))
kable(sig.vars.df[order(abs(sig.vars.df$coef), decreasing = T),]) %>% kable_styling(bootstrap_options = c("striped","condensed", "responsive", full_width = F, position = "centre"), fixed_thead = list(enabled = T, background = "lightgrey"))
sig.vars.list coef l.ci u.ci
V6 volume_of_grey_matter_normalised_for_head_size_f25005_2_0 -1.466 -1.851 -1.242
V800 weightedmean_icvf_in_tract_forceps_minor_f25661_2_0 -1.029 -1.789 -0.699
V26 volume_of_brain_stem_4th_ventricle_f25025_2_0 0.692 0.484 1.139
V164 volume_of_grey_matter_in_ventral_striatum_left_f25890_2_0 -0.666 -1.022 -0.393
V711 weightedmean_l1_in_tract_anterior_thalamic_radiation_right_f25572_2_0 0.513 0.218 1.026
V584 mean_isovf_in_fornix_on_fa_skeleton_f25445_2_0 0.501 0.219 1.003
V195 mean_fa_in_middle_cerebellar_peduncle_on_fa_skeleton_f25056_2_0 -0.477 -0.910 -0.275
V157 volume_of_grey_matter_in_putamen_right_f25883_2_0 0.460 0.262 0.911
V13 volume_of_thalamus_right_f25012_2_0 -0.454 -0.909 -0.116
V210 mean_fa_in_cerebral_peduncle_on_fa_skeleton_left_f25071_2_0 -0.453 -0.906 -0.226
V208 mean_fa_in_superior_cerebellar_peduncle_on_fa_skeleton_left_f25069_2_0 0.429 0.053 0.859
V339 mean_l1_in_middle_cerebellar_peduncle_on_fa_skeleton_f25200_2_0 -0.428 -0.815 -0.162
V463 mean_l3_in_posterior_thalamic_radiation_on_fa_skeleton_right_f25324_2_0 0.418 0.129 0.835
V426 mean_l2_in_fornix_cresstria_terminalis_on_fa_skeleton_left_f25287_2_0 0.393 0.062 0.787
V486 mean_icvf_in_body_of_corpus_callosum_on_fa_skeleton_f25347_2_0 0.381 0.031 0.762
V356 mean_l1_in_anterior_limb_of_internal_capsule_on_fa_skeleton_left_f25217_2_0 0.349 0.095 0.698
V330 mean_mo_in_fornix_cresstria_terminalis_on_fa_skeleton_left_f25191_2_0 -0.336 -0.610 -0.061
V173 volume_of_grey_matter_in_vi_cerebellum_right_f25899_2_0 -0.330 -0.661 -0.059
V137 volume_of_grey_matter_in_frontal_operculum_cortex_right_f25863_2_0 -0.321 -0.582 -0.146
V572 mean_od_in_superior_longitudinal_fasciculus_on_fa_skeleton_left_f25433_2_0 0.316 0.169 0.633
V178 volume_of_grey_matter_in_vermis_crus_ii_cerebellum_f25904_2_0 0.306 0.115 0.532
V787 weightedmean_l3_in_tract_uncinate_fasciculus_left_f25648_2_0 0.302 0.002 0.603
V16 volume_of_putamen_left_f25015_2_0 -0.284 -0.567 -0.011
V279 mean_md_in_cingulum_hippocampus_on_fa_skeleton_right_f25140_2_0 -0.272 -0.544 -0.109
V391 mean_l2_in_splenium_of_corpus_callosum_on_fa_skeleton_f25252_2_0 -0.271 -0.542 -0.014
V683 weightedmean_mo_in_tract_anterior_thalamic_radiation_left_f25544_2_0 0.263 0.065 0.526
V48 90th_percentile_of_bold_effect_in_groupdefined_mask_for_shapes_activation_f25761_2_0 0.229 0.050 0.425
V692 weightedmean_mo_in_tract_forceps_minor_f25553_2_0 -0.226 -0.452 -0.031
V31 median_t2star_in_putamen_left_f25030_2_0 -0.221 -0.442 -0.016
V153 volume_of_grey_matter_in_thalamus_right_f25879_2_0 0.217 0.005 0.435
V45 median_bold_effect_in_groupdefined_mask_for_facesshapes_contrast_f25048_2_0 -0.212 -0.423 -0.060
V714 weightedmean_l1_in_tract_parahippocampal_part_of_cingulum_left_f25575_2_0 -0.174 -0.348 -0.001
V1026 Partial_corr_25_dim_25752_2_0_v157 0.165 0.024 0.329
V421 mean_l2_in_cingulum_cingulate_gyrus_on_fa_skeleton_right_f25282_2_0 -0.119 -0.237 -0.009
## sort dataset by coefficient
ci.df2 <- ci.df[order(ci.df$coef, decreasing = T),]
# drop intercept from plot using [-1,] in data.frame ci.df (i.e., the intercept is the top row)
plot(ci.df2[-1,1], ylim = c(min(ci.df2[-1,2]), max(ci.df2[-1,3])),
     pch = 20, col = "darkgoldenrod2", ylab = "LASSO coefficient") + 
  arrows(x0 = 1:(n - 1), y0 = ci.df2[-1,2], y1 = ci.df2[-1,3],
         length = 0.02, angle = 90, code = 3, col = "grey") +
  abline(h = 0, type = 2)

## integer(0)

Plot only the significant variables

There are 34 variables with CIs that don’t overlap zero.

## sort dataset by coefficient
sig.vars.df2 <- sig.vars.df[order(sig.vars.df$coef, decreasing = T),]
opar <- par() 
par(mar = c(15, 4, 1, 2))
axis_labels <- gsub("_f....._2_0", "", sig.vars.df2[,1])
plot(sig.vars.df2$coef, ylim = c(min(sig.vars.df2$l.ci),max(sig.vars.df2$u.ci)),
     pch = 20, col = "darkgoldenrod2", ylab = "LASSO coefficient", xaxt = "n", xlab = "") + 
  arrows(x0 = 1:dim(sig.vars.df2)[1], y0 = sig.vars.df2$l.ci, y1 = sig.vars.df2$u.ci,
         length = 0.02, angle = 90, code = 3, col = "grey") +
  abline(h = 0, type = 2)
## integer(0)
axis(side = 1, at = 1:length(sig.vars.list), labels = axis_labels, las = 2, cex.axis = 0.8)

par(opar)

Run model with only significant variables

Top variables OLS

top.ols <- lm(train_labels ~ .,
          data = scaled.train_data[,sig.vars.index])
top.ols.pred <- predict(object = top.ols, newdata = scaled.test_data[,sig.vars.index])

test_results(top.ols.pred)
value
cor r 0.734
R^2 0.538
MAE 3.817
cor Age.bias -0.59
age_plot(top.ols.pred)

Top variables LASSO

## fit model using optimal lambda value (1 SE value, not minimum)
set.seed(1883)
top.lasso.fit <- glmnet(x = x.train[,sig.vars.index], y = y.train,
                    alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
top.lasso.pred <- predict(top.lasso.fit, newx = as.matrix(scaled.test_data[,sig.vars.index]))
test_results(top.lasso.pred)
value
cor r 0.735
R^2 0.54
MAE 3.792
cor Age.bias -0.677
age_plot(top.lasso.pred)

Modality specific analysis

Function to run all LASSO analyses

Using the same training/test split as above.

run_lasso <- function(list) {
  set.seed(1883)
  train_data <- healthy.df[index == 1, list]
  test_data <- healthy.df[index == 2,  list]
  scaled.train_data <- as.data.frame(scale(train_data))
  scaled.test_data <- as.data.frame(scale(test_data))
  x.train <- as.matrix(scaled.train_data)
  dimnames(x.train) <- NULL
  y.train <- as.matrix(train_labels)
  lasso.fit.cv <- cv.glmnet(x = x.train, y = y.train, alpha = 1, family = "gaussian") ## cross-validation for lambda
  lasso.fit <- glmnet(x = x.train, y = y.train, alpha = 1, family = "gaussian", lambda = lasso.fit.cv$lambda.1se)
  lasso.pred <- predict(lasso.fit, newx = as.matrix(scaled.test_data))
  dimnames(lasso.pred)[[2]] <- deparse(substitute(list))
  return(lasso.pred)
}

T1

t1.vars.list <- grep("forced|nifti|discrepancy|t2_flair", grep("volume", names(df), value = T), invert = T, value = T)
t1.pred <- run_lasso(t1.vars.list)
test_results(t1.pred)
value
cor r 0.684
R^2 0.468
MAE 4.14
cor Age.bias -0.721

T2

Only one variable, just use OLS regression.

t2.ols <- lm(train_labels ~ total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0,
          data = scaled.train_data)
t2.ols.pred <- predict(object = t2.ols, newdata = scaled.test_data)
test_results(t2.ols.pred)
value
cor r 0.308
R^2 0.095
MAE 5.653
cor Age.bias -0.942

T2-star

t2star.vars.list <- grep("forced|nifti|discrepancy", grep("t2star", names(df), value = T), invert = T, value = T)
t2star.pred <- run_lasso(t2star.vars.list)
test_results(t2star.pred)
value
cor r 0.329
R^2 0.108
MAE 5.78
cor Age.bias -0.987

Diffusion

diffusion.vars.list <- grep("forced|nifti|discrepancy", grep("skeleton|tract", names(df), value = T), invert = T, value = T)
diffusion.pred <- run_lasso(diffusion.vars.list)
test_results(diffusion.pred)
value
cor r 0.73
R^2 0.533
MAE 3.897
cor Age.bias -0.638

Task fMRI

task.vars.list <- grep("forced|nifti|discrepancy", grep("bold|activation", names(df), value = T), invert = T, value = T)
task.pred <- run_lasso(task.vars.list)
test_results(task.pred)
value
cor r 0.161
R^2 0.026
MAE 5.929
cor Age.bias -0.986

Resting state fMRI

rest.vars.list <- grep("forced|nifti|discrepancy", grep("Partial_corr_25_dim", names(df), value = T), invert = T, value = T)
rest.pred <- run_lasso(rest.vars.list)
test_results(rest.pred)
value
cor r 0.44
R^2 0.194
MAE 5.261
cor Age.bias -0.921

Correlations across modalities

pred.mat <- cbind(test_labels, t1.pred, t2.ols.pred, t2star.pred, diffusion.pred, task.pred, rest.pred)
colnames(pred.mat) <- c("Age", "T1-weighted","T2-FLAIR","T2*","Diffusion","Task fMRI","Resting-state fMRI")
# col1 <- colorRampPalette(brewer.pal(n = 10, name = "RdBu"))
col1 <- colorRampPalette(colors = c("firebrick", "white", "dodgerblue"))
cairo_pdf(filename = "~/Work/Articles/Brain age/UK Biobank multi-modal brain age/variable_corrplot.pdf")
corrplot(cor(pred.mat), type = "upper", diag = T, method = "color", col = col1(100),
         addCoef.col = "black", tl.col = "black")
dev.off()
## quartz_off_screen 
##                 2
corrplot(cor(pred.mat), type = "upper", diag = T, method = "color", col = col1(100),
         addCoef.col = "black", tl.col = "black")

Leave-one-modality-out analysis

Not T1

not.t1.list <- c("total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0",  t2star.vars.list, diffusion.vars.list, task.vars.list, rest.vars.list)
not.t1.pred <- run_lasso(not.t1.list)
test_results(not.t1.pred)
value
cor r 0.751
R^2 0.565
MAE 3.752
cor Age.bias -0.654

Not T2

not.t2.list <- c(t1.vars.list,  t2star.vars.list, diffusion.vars.list, task.vars.list, rest.vars.list)
not.t2.pred <- run_lasso(not.t2.list)
test_results(not.t2.pred)
value
cor r 0.778
R^2 0.605
MAE 3.572
cor Age.bias -0.642

Not T2*

not.t2star.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", diffusion.vars.list, task.vars.list, rest.vars.list)
not.t1.pred <- run_lasso(not.t2star.list)
test_results(not.t1.pred)
value
cor r 0.773
R^2 0.598
MAE 3.598
cor Age.bias -0.641

Not diffusion

not.diffusion.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0", t2star.vars.list, task.vars.list, rest.vars.list)
not.diffusion.pred <- run_lasso(not.diffusion.list)
test_results(not.diffusion.pred)
value
cor r 0.715
R^2 0.511
MAE 3.975
cor Age.bias -0.692

Not task

not.task.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0",  t2star.vars.list, diffusion.vars.list, rest.vars.list)
not.task.pred <- run_lasso(not.task.list)
test_results(not.task.pred)
value
cor r 0.781
R^2 0.61
MAE 3.569
cor Age.bias -0.634

Not resting-state

not.rest.list <- c(t1.vars.list, "total_volume_of_white_matter_hyperintensities_from_t1_and_t2_flair_images_f25781_2_0",  t2star.vars.list, diffusion.vars.list, task.vars.list)
not.rest.pred <- run_lasso(not.rest.list)
test_results(not.rest.pred)
value
cor r 0.779
R^2 0.607
MAE 3.522
cor Age.bias -0.615

Testing on non-healthy people

Define non-healthy people as testing set.

non.healthy.df <- subset(df, df$healthy == "non-healthy")
table(complete.cases(non.healthy.df[,imaging.vars.list]))
## 
##  TRUE 
## 14701
kable(describe(non.healthy.df$age_at_scan), digits = 2) %>% kable_styling(bootstrap_options = c("striped", "condensed"))
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 14701 62.64 7.45 63 62.78 8.9 45 80 35 -0.16 -0.88 0.06
describeBy(non.healthy.df$age_at_scan, non.healthy.df$sex)
## 
##  Descriptive statistics by group 
## group: Female
##    vars    n  mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 7914 61.87 7.33     62   61.93 8.9  45  80    35 -0.07    -0.89
##      se
## X1 0.08
## -------------------------------------------------------- 
## group: Male
##    vars    n  mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 6787 63.54 7.49     65    63.8 7.41  45  80    35 -0.29    -0.82
##      se
## X1 0.09

Scale new subjects variables using the scaling parameters from the training set.

scaled.non.healthy.test <- as.data.frame(scale(non.healthy.df[,imaging.vars.list], scaling.parameters.center, scaling.parameters.scale))
lasso.non.healthy.pred <- as.numeric(predict(lasso.fit, newx = as.matrix(scaled.non.healthy.test)))

Evaluate multi-modality brain-age model performance

non.healthy.labels <- non.healthy.df$age_at_scan
test_results2 <- function(pred, labels) {
  r <- cor.test(labels, pred)$estimate
  r.sq <- summary(lm(labels ~ pred))$r.squared
  MAE <- mean(abs(pred - labels), na.rm = T)
  age.bias <- cor.test(labels, (pred - labels))$estimate
  value <- sapply(c(r,r.sq, MAE, age.bias), function(x) round(x, 3))
  results <- cbind(c("r", "R^2", "MAE", "Age.bias"), value)
  return(kable(results) %>% kable_styling(bootstrap_options = c("striped","condensed", "responsive", full_width = F, position = "centre")))
}
test_results2(lasso.non.healthy.pred, non.healthy.labels)
value
cor r 0.803
R^2 0.644
MAE 3.555
cor Age.bias -0.64

Brain-predicted age plot

ggplot() + 
  geom_abline(slope = 1, intercept = 0) +
  geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), shape = 16, col = "darkgoldenrod2", size = 1.5, alpha = 0.25) +
  geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), method = "lm", col = "darkgrey") +
  xlab("Age (years)") +
  ylab("Brain-predicted age (years)") +
  theme_cowplot()

Correct for age bias

Calculate age bias in initial test data.

bias.model <- lm(lasso.pred ~ test_labels)
bias.model$coefficients[1]
## (Intercept) 
##    25.72573
bias.model$coefficients[2]
## test_labels 
##   0.5797221

Apply correction to new (non-healthy) test data. Subtract the intercept and then divide by the slope

lasso.non.healthy.pred.corrected <- (lasso.non.healthy.pred - bias.model$coefficients[1]) / bias.model$coefficients[2]
test_results2(lasso.non.healthy.pred.corrected, non.healthy.labels)
value
cor r 0.803
R^2 0.644
MAE 4.631
cor Age.bias 0.083

Age-bias corrected plot

ggplot() + 
  geom_abline(slope = 1, intercept = 0) + 
  geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), shape = 16, col = "darkgoldenrod2", size = 1.5, alpha  = 0.25) +
  geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), method = "lm", col = "darkgrey") +
  xlab("Age (years)") +
  ylab("Brain-predicted age (years)") +
  theme_cowplot()

Save plot for paper

Using cowplot package

plot1 <- ggplot() +
  geom_abline(slope = 1, intercept = 0, lty = 2) +
  geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), shape = 16, col = "darkgoldenrod2", size = 1.5, alpha  = 0.25) +
  geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred), method = "lm", col = "darkgrey") +
  xlab("Age (years)") +
  ylab("Brain-predicted age (years)") +
  theme_cowplot()
plot2 <- ggplot() +
  geom_abline(slope = 1, intercept = 0, lty = 2) +
  geom_point(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), shape = 16, col = "#0ebdee", size = 1.5, alpha  = 0.25) +
  geom_smooth(aes(x = non.healthy.labels, y = lasso.non.healthy.pred.corrected), method = "lm", col = "darkgrey") +
  xlab("Age (years)") +
  ylab("Bias-adjusted age (years)") +
  theme_cowplot()
plot_grid(plot1, plot2, ncol = 2, labels = "AUTO")

ggsave("~/Work/Articles/Brain age/UK Biobank multi-modal brain age/bias_correction_scatterplot.pdf", plot_grid(plot1, plot2, ncol = 2, labels = "AUTO"), useDingbats = FALSE, dpi = 100, height = 4, width = 8)

Brain-PAD: descriptive stats

Define brain-PAD and look at descriptives

non.healthy.df$brainPAD <- lasso.non.healthy.pred.corrected - non.healthy.df$age_at_scan
kable(describe(non.healthy.df$brainPAD), digits = 2) %>% kable_styling(bootstrap_options = c("striped", "condensed"))
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 14701 0.58 5.92 0.33 0.44 5.66 -24.26 38.27 62.53 0.35 0.87 0.05
hist(non.healthy.df$brainPAD, breaks = 25, col = "darkgoldenrod2", xlab = "Brain-PAD (years)")

Brain-PAD: evaluate potential covariates

Decide what to use as covariates for brain-PAD

non.healthy.df$head_motion_tfmri <- non.healthy.df$mean_tfmri_head_motion_averaged_across_space_and_time_points_f25742_2_0
m1 <- (lm(brainPAD ~ poly(age_at_scan, 2, raw = F) + 
                    sex + 
                    height_f12144_2_0 +
                    volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
                    head_motion_tfmri,
                  data = non.healthy.df))
summary(m1)
## 
## Call:
## lm(formula = brainPAD ~ poly(age_at_scan, 2, raw = F) + sex + 
##     height_f12144_2_0 + volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 + 
##     head_motion_tfmri, data = non.healthy.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.102  -3.802  -0.206   3.577  35.068 
## 
## Coefficients:
##                                                                     Estimate
## (Intercept)                                                        -3.654316
## poly(age_at_scan, 2, raw = F)1                                     17.099857
## poly(age_at_scan, 2, raw = F)2                                     59.098200
## sexMale                                                             0.929932
## height_f12144_2_0                                                  -0.016317
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0  2.893481
## head_motion_tfmri                                                  19.543429
##                                                                    Std. Error
## (Intercept)                                                          1.649942
## poly(age_at_scan, 2, raw = F)1                                       6.030984
## poly(age_at_scan, 2, raw = F)2                                       5.731581
## sexMale                                                              0.148847
## height_f12144_2_0                                                    0.007846
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0   0.519682
## head_motion_tfmri                                                    0.752032
##                                                                    t value
## (Intercept)                                                         -2.215
## poly(age_at_scan, 2, raw = F)1                                       2.835
## poly(age_at_scan, 2, raw = F)2                                      10.311
## sexMale                                                              6.248
## height_f12144_2_0                                                   -2.080
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0   5.568
## head_motion_tfmri                                                   25.987
##                                                                    Pr(>|t|)
## (Intercept)                                                         0.02679
## poly(age_at_scan, 2, raw = F)1                                      0.00458
## poly(age_at_scan, 2, raw = F)2                                      < 2e-16
## sexMale                                                            4.28e-10
## height_f12144_2_0                                                   0.03757
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 2.63e-08
## head_motion_tfmri                                                   < 2e-16
##                                                                       
## (Intercept)                                                        *  
## poly(age_at_scan, 2, raw = F)1                                     ** 
## poly(age_at_scan, 2, raw = F)2                                     ***
## sexMale                                                            ***
## height_f12144_2_0                                                  *  
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 ***
## head_motion_tfmri                                                  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.724 on 14694 degrees of freedom
## Multiple R-squared:  0.0654, Adjusted R-squared:  0.06502 
## F-statistic: 171.4 on 6 and 14694 DF,  p-value: < 2.2e-16
lm.beta(m1)
## 
## Call:
## lm(formula = brainPAD ~ poly(age_at_scan, 2, raw = F) + sex + 
##     height_f12144_2_0 + volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 + 
##     head_motion_tfmri, data = non.healthy.df)
## 
## Standardized Coefficients::
##                                                        (Intercept) 
##                                                         0.00000000 
##                                     poly(age_at_scan, 2, raw = F)1 
##                                                         0.02382672 
##                                     poly(age_at_scan, 2, raw = F)2 
##                                                         0.08234668 
##                                                            sexMale 
##                                                         0.07832250 
##                                                  height_f12144_2_0 
##                                                        -0.02577438 
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 
##                                                         0.05963084 
##                                                  head_motion_tfmri 
##                                                         0.21564003
etasq(m1, partial = T)
##                                                                    Partial eta^2
## poly(age_at_scan, 2, raw = F)                                       0.0076814828
## sex                                                                 0.0026492783
## height_f12144_2_0                                                   0.0002942559
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0  0.0021052835
## head_motion_tfmri                                                   0.0439413342
## Residuals                                                                     NA

Hierachical partitioning of variance of covariates

Used hier.part package to define unique (i.e., independent) variance and joint (i.e., shared) variance in brain-PAD.

hp.res <- hier.part(non.healthy.df$brainPAD, non.healthy.df[c("age_at_scan","sex", "height_f12144_2_0", "volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0", "head_motion_tfmri")], gof = "Rsqu", barplot = F)
round(hp.res$gfs,3)
##  [1] 0.000 0.007 0.002 0.000 0.001 0.054 0.008 0.007 0.008 0.055 0.006
## [12] 0.008 0.055 0.001 0.054 0.055 0.011 0.014 0.056 0.008 0.055 0.056
## [23] 0.010 0.056 0.058 0.055 0.015 0.057 0.058 0.056 0.058 0.059
round(hp.res$IJ,3)
##                                                                        I
## age_at_scan                                                        0.004
## sex                                                                0.003
## height_f12144_2_0                                                  0.001
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.002
## head_motion_tfmri                                                  0.049
##                                                                         J
## age_at_scan                                                         0.003
## sex                                                                -0.001
## height_f12144_2_0                                                  -0.001
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 -0.001
## head_motion_tfmri                                                   0.005
##                                                                    Total
## age_at_scan                                                        0.007
## sex                                                                0.002
## height_f12144_2_0                                                  0.000
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 0.001
## head_motion_tfmri                                                  0.054
round(hp.res$I.perc,3)
##                                                                         I
## age_at_scan                                                         6.012
## sex                                                                 5.486
## height_f12144_2_0                                                   1.085
## volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0  3.260
## head_motion_tfmri                                                  84.156

Barplot of unique and shared variance

barplot(t(as.matrix(hp.res$IJ[,1:2])), col = c("darkgoldenrod2", "dodgerblue"), names.arg = c("Age", "Sex", "Height", "WMH volume", "Head motion"), xlab = "Variables", ylab = "Variance explained (R^2) in  brain-PAD")

Define function to run linear regression with required covariates

P-values are corrected using FDR for 18 different comparisons.

run_lm <- function(var, data) {
  m1 <- lm(brainPAD ~ data[[var]] +
    poly(age_at_scan, 2, raw = F) +
    sex +
    height_f12144_2_0 +
    volumetric_scaling_from_t1_head_image_to_standard_space_f25000_2_0 +
    head_motion_tfmri,
  data = data)
  y <- round(summary(m1)$coefficients[2,],3)
  z <- round(p.adjust(summary(m1)$coefficients[2,4], method = "fdr", n = 18),5)
  names(z) <- "corrected_p"
  part.eta <- round(etasq(m1, partial = T)[1,1],4)
  names(part.eta) <- "partial eta^2"
  return(c(y,z,part.eta))
}

Health parameters

Cardiac

run_lm("diastolic_blood_pressure_automated_reading_f4079_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        0.0490        0.0050        9.5200        0.0000        0.0000 
## partial eta^2 
##        0.0074
run_lm("systolic_blood_pressure_automated_reading_f4080_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        0.0280        0.0030        9.4330        0.0000        0.0000 
## partial eta^2 
##        0.0073

Obesity

run_lm("body_mass_index_bmi_f21001_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##         0.008         0.013         0.646         0.518         1.000 
## partial eta^2 
##         0.000
run_lm("weight_f21002_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        0.0060        0.0050        1.4000        0.1620        1.0000 
## partial eta^2 
##        0.0001
run_lm("hip_circumference_f49_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##       -0.0080        0.0060       -1.3810        0.1670        1.0000 
## partial eta^2 
##        0.0001

Diabetes

Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.

with(subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3), table(diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0))
## diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0
##    -1    -3     0     1 
##     0     0 13747   836
run_lm("diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0", subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3))
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        2.1150        0.2070       10.2080        0.0000        0.0000 
## partial eta^2 
##        0.0071
TukeyHSD(aov(brainPAD ~ diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, data = subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3)))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = brainPAD ~ diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0, data = subset(non.healthy.df, non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -1 & non.healthy.df$diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0 != -3))
## 
## $diabetes_diagnosed_by_doctoruses_datacoding_100349_f2443_2_0
##         diff      lwr      upr p adj
## 1-0 3.200375 2.790681 3.610068     0

Stroke

Question What was your age when the stroke was first diagnosed? recoded to be presence/absence of stroke history. Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.

run_lm("stroke_history", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##         2.695         0.407         6.614         0.000         0.000 
## partial eta^2 
##         0.003
TukeyHSD(aov(brainPAD ~ stroke_history, data = non.healthy.df))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = brainPAD ~ stroke_history, data = non.healthy.df)
## 
## $stroke_history
##                      diff      lwr      upr p adj
## stroke-no stroke 3.426915 2.604718 4.249112     0

Facial ageing

Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.

with(subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 > 0), table(facial_ageinguses_datacoding_100435_f1757_2_0))
## facial_ageinguses_datacoding_100435_f1757_2_0
##     1     2     3 
## 10577   157  2668
run_lm("facial_ageinguses_datacoding_100435_f1757_2_0", subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 > 0))
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##       -0.5550        0.4620       -1.2010        0.2300        1.0000 
## partial eta^2 
##        0.0001
with(subset(non.healthy.df, non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 != -1 & non.healthy.df$facial_ageinguses_datacoding_100435_f1757_2_0 != -3), describeBy(brainPAD, as.factor(facial_ageinguses_datacoding_100435_f1757_2_0)))
## 
##  Descriptive statistics by group 
## group: 1
##    vars     n mean   sd median trimmed  mad    min   max range skew
## X1    1 10577 0.51 5.91   0.26    0.36 5.64 -24.26 34.18 58.44 0.34
##    kurtosis   se
## X1     0.79 0.06
## -------------------------------------------------------- 
## group: 2
##    vars   n mean   sd median trimmed  mad    min   max range  skew
## X1    1 157 0.23 6.15   0.19     0.3 6.18 -17.52 19.18  36.7 -0.04
##    kurtosis   se
## X1     0.33 0.49
## -------------------------------------------------------- 
## group: 3
##    vars    n mean   sd median trimmed  mad    min   max range skew
## X1    1 2668 0.58 5.89   0.39    0.45 5.67 -16.43 38.27  54.7 0.38
##    kurtosis   se
## X1     1.09 0.11

Smoking

Prefer not to answer coded as -3. These values need to be excluded.

table(non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0)
## 
##   -3    0    1    2 
##   49 9006 5009  558
with(subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3), table(smoking_statususes_datacoding_90_f20116_2_0))
## smoking_statususes_datacoding_90_f20116_2_0
##    0    1    2 
## 9006 5009  558
run_lm("smoking_statususes_datacoding_90_f20116_2_0", subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3))
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##         0.879         0.102         8.636         0.000         0.000 
## partial eta^2 
##         0.007
TukeyHSD(aov(brainPAD ~ smoking_statususes_datacoding_90_f20116_2_0, data = subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3)))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = brainPAD ~ smoking_statususes_datacoding_90_f20116_2_0, data = subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3))
## 
## $smoking_statususes_datacoding_90_f20116_2_0
##          diff       lwr      upr     p adj
## 1-0 1.2244273 0.9815577 1.467297 0.0000000
## 2-0 2.0732981 1.4721869 2.674409 0.0000000
## 2-1 0.8488708 0.2339260 1.463816 0.0034851
with(subset(non.healthy.df, non.healthy.df$smoking_statususes_datacoding_90_f20116_2_0 != -3), describeBy(brainPAD, smoking_statususes_datacoding_90_f20116_2_0))
## 
##  Descriptive statistics by group 
## group: 0
##    vars    n mean   sd median trimmed  mad    min   max range skew
## X1    1 9006 0.07 5.78  -0.15   -0.06 5.58 -24.26 38.27 62.53 0.34
##    kurtosis   se
## X1      0.9 0.06
## -------------------------------------------------------- 
## group: 1
##    vars    n mean   sd median trimmed  mad    min   max range skew
## X1    1 5009 1.29 6.03   1.02    1.15 5.78 -23.62 34.18  57.8 0.33
##    kurtosis   se
## X1     0.84 0.09
## -------------------------------------------------------- 
## group: 2
##    vars   n mean   sd median trimmed  mad    min   max range skew kurtosis
## X1    1 558 2.14 6.14   1.76    1.92 5.81 -14.38 28.33 42.71 0.47      0.8
##      se
## X1 0.26

Alcohol

Prefer not to answer coded as -3. These values need to be excluded.

run_lm("alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0", subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3))
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        -0.997         0.147        -6.776         0.000         0.000 
## partial eta^2 
##         0.009
TukeyHSD(aov(brainPAD ~ alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0, data = subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3)))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = brainPAD ~ alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0, data = subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3))
## 
## $alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0
##           diff         lwr          upr     p adj
## 2-1 -1.1541854 -1.58676824 -0.721602484 0.0000000
## 3-1 -1.5252477 -1.96057436 -1.089920992 0.0000000
## 4-1 -1.7463517 -2.27707136 -1.215632107 0.0000000
## 5-1 -1.2546140 -1.80184014 -0.707387820 0.0000000
## 6-1 -0.9950198 -1.64425031 -0.345789271 0.0001832
## 3-2 -0.3710623 -0.74594379  0.003819168 0.0542344
## 4-2 -0.5921664 -1.07454602 -0.109786722 0.0062410
## 5-2 -0.1004286 -0.60091166  0.400054423 0.9928403
## 6-2  0.1591656 -0.45118438  0.769515530 0.9764937
## 4-3 -0.2211041 -0.70594578  0.263737669 0.7853564
## 5-3  0.2706337 -0.23222279  0.773490184 0.6422536
## 6-3  0.5302279 -0.08206979  1.142525562 0.1337037
## 5-4  0.4917378 -0.09564513  1.079120632 0.1611072
## 6-4  0.7513319  0.06791227  1.434751611 0.0214300
## 6-5  0.2595942 -0.43672154  0.955909919 0.8961925
with(subset(non.healthy.df, non.healthy.df$alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0 != -3), describeBy(brainPAD, alcohol_intake_frequencyuses_datacoding_100402_f1558_2_0))
## 
##  Descriptive statistics by group 
## group: 1
##    vars    n mean   sd median trimmed  mad    min  max range skew kurtosis
## X1    1 2388 1.71 6.16   1.34    1.48 5.79 -21.19 29.8    51 0.41     0.66
##      se
## X1 0.13
## -------------------------------------------------------- 
## group: 2
##    vars    n mean   sd median trimmed mad    min   max range skew kurtosis
## X1    1 4081 0.55 5.72   0.34    0.46 5.4 -23.62 28.72 52.33 0.25      0.7
##      se
## X1 0.09
## -------------------------------------------------------- 
## group: 3
##    vars    n mean   sd median trimmed  mad    min   max range skew
## X1    1 3945 0.18 5.96   0.03    0.06 5.77 -24.26 38.27 62.53 0.43
##    kurtosis   se
## X1     1.46 0.09
## -------------------------------------------------------- 
## group: 4
##    vars    n  mean   sd median trimmed  mad    min   max range skew
## X1    1 1723 -0.04 5.62   -0.3   -0.19 5.53 -16.45 22.26 38.71  0.3
##    kurtosis   se
## X1     0.33 0.14
## -------------------------------------------------------- 
## group: 5
##    vars    n mean   sd median trimmed mad    min   max range skew kurtosis
## X1    1 1554 0.45 5.99   0.23     0.3 5.6 -20.62 25.71 46.32 0.33     0.77
##      se
## X1 0.15
## -------------------------------------------------------- 
## group: 6
##    vars   n mean   sd median trimmed  mad    min   max range skew kurtosis
## X1    1 929 0.71 5.95   0.44     0.6 5.77 -15.34 24.01 39.36 0.24     0.31
##     se
## X1 0.2

Physical activity

Moderate activity

Do not know coded as -1, Prefer not to answer coded as -3. These values need to be excluded.

run_lm("duration_of_moderate_activityuses_datacoding_100291_f894_2_0", subset(non.healthy.df, non.healthy.df$duration_of_moderate_activityuses_datacoding_100291_f894_2_0 != -3 & non.healthy.df$duration_of_moderate_activityuses_datacoding_100291_f894_2_0 != -1))
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##         0.000         0.001         0.279         0.780         1.000 
## partial eta^2 
##         0.000

Vigorous activity

run_lm("duration_of_vigorous_activityuses_datacoding_100291_f914_2_0", subset(non.healthy.df, non.healthy.df$duration_of_vigorous_activityuses_datacoding_100291_f914_2_0 != -3 & non.healthy.df$duration_of_vigorous_activityuses_datacoding_100291_f914_2_0 != -1))
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        -0.001         0.001        -0.461         0.645         1.000 
## partial eta^2 
##         0.000

Cognitive performance

Tests including Fluid intelligence, Trail-making task, Matrix pattern completion, Tower rearranging.

run_lm("fluid_intelligence_score_f20016_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##       -0.1470        0.0240       -5.9980        0.0000        0.0000 
## partial eta^2 
##        0.0027
## Trail making
run_lm("duration_to_complete_numeric_path_trail_1uses_datacoding_1990_f6348_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##       0.00300       0.00100       2.71200       0.00700       0.12066 
## partial eta^2 
##       0.00120
run_lm("duration_to_complete_alphanumeric_path_trail_2uses_datacoding_1990_f6350_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        0.0020        0.0000        5.6670        0.0000        0.0000 
## partial eta^2 
##        0.0054
## Matrix pattern completion
run_lm("number_of_puzzles_correctly_solved_f6373_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##       -0.2180        0.0370       -5.8820        0.0000        0.0000 
## partial eta^2 
##        0.0059
run_lm("duration_spent_answering_each_puzzle_f6333_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        0.0100        0.0070        1.4520        0.1470        1.0000 
## partial eta^2 
##        0.0004
## Tower rearranging
run_lm("number_of_puzzles_correct_f6382_2_0", non.healthy.df)
##      Estimate    Std. Error       t value      Pr(>|t|)   corrected_p 
##        -0.117         0.021        -5.468         0.000         0.000 
## partial eta^2 
##         0.005