Read in necessary libraries, functions and look up tables
library(igraph)
library(qgraph)
library(casnet)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(psych)
library(lme4)
library(reshape2)
library(lubridate)
library(parallel)
library(wesanderson)
library(stringr)
library(matrixStats)
library(ggthemes)
library(gridExtra)
library(cowplot)
# save the original plotting space parameters
og_par <- par()
Read in data
Prepare the Outcome measures
for (timepoint in c('m00', 'm03', 'm06', 'm09', 'm12')) {
aumc_df[, paste0(timepoint, "_NHPT_D_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_NHPT_D1"), paste0(timepoint,"_NHPT_D2"))], na.rm=T)
aumc_df[, paste0(timepoint, "_NHPT_ND_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_NHPT_ND1"), paste0(timepoint,"_NHPT_ND2"))], na.rm=T)
aumc_df[, paste0(timepoint, "_NHPT_BH_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_NHPT_D_avg"), paste0(timepoint,"_NHPT_ND_avg"))], na.rm=T)
aumc_df[, paste0(timepoint, "_TWT_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_TWT_1"), paste0(timepoint,"_TWT_2"))], na.rm=T)
}
Get a list of all the outcome measures of interest at each time point
outcomes_of_interest_ls <- c()
for (outcome in outcomes_list) {
outcomes_of_interest_ls <- append(outcomes_of_interest_ls, colnames(aumc_df %>% select(matches(outcome))))
}
Features used in the creation of the complexity measures (taken RQA python notebook)
# features with highest rqa measures (most interesting NLTSA wise)
feature_selection <- c('post_correction_slowing_MEAN_2CD',
'pre_correction_slowing_MEAN_2CD',
'hold_time_STD',
'flight_time_MEDIAN',
'flight_time_MEAN_ABS_CHANGE',
'hold_time_SKEW',
'correction_duration_MEAN_2CD',
'post_correction_slowing_P_AUTO_CORR',
'after_punctuation_pause_MEAN_CHANGE',
'after_punctuation_pause_MEAN_2CD',
'session_duration_MEAN_2CD',
'after_punctuation_pause_MEDIAN',
'flight_time_MAX')
feature_selection
[1] "post_correction_slowing_MEAN_2CD" "pre_correction_slowing_MEAN_2CD" "hold_time_STD"
[4] "flight_time_MEDIAN" "flight_time_MEAN_ABS_CHANGE" "hold_time_SKEW"
[7] "correction_duration_MEAN_2CD" "post_correction_slowing_P_AUTO_CORR" "after_punctuation_pause_MEAN_CHANGE"
[10] "after_punctuation_pause_MEAN_2CD" "session_duration_MEAN_2CD" "after_punctuation_pause_MEDIAN"
[13] "flight_time_MAX"
Identify the different groups (change in MRI, no change in MRI and HC users) and prep the data for the analysis
mri_cols <- colnames(aumc_df %>% select(matches("mri_enhancing")))
diff_df <- NA
for (i in seq(1, length(mri_cols)-1)) {
diff_df <- cbind(diff_df, aumc_df[mri_cols[i+1]] - aumc_df[mri_cols[i]])
}
diff_df <- diff_df[, 2:ncol(diff_df)]
diff_df <- diff_df[rowSums(is.na(diff_df)) < 3, ]
## SPECIFY THE INDICES YOU WISH TO INVESTIGATE
row_amount <- 58
# user indices which have a change in mri
mri_change_index <- as.integer(names(which(rowSums(abs(diff_df), na.rm=TRUE) > 0)))
mri_change_ls <- aumc_df$user_id[mri_change_index]
short_mri_change_ls <- c()
for (user in mri_change_ls) {
if (nrow(key_df[key_df$user_id == user, ]) > row_amount) {
short_mri_change_ls <- c(short_mri_change_ls, user)
}
}
mri_change_index <- which(aumc_df$user_id %in% short_mri_change_ls)
# user indices which don't have a change in mri
mri_no_change_index <- as.integer(names(which(rowSums(abs(diff_df), na.rm=TRUE) == 0)))
mri_no_change_ls <- aumc_df$user_id[mri_no_change_index]
short_mri_no_change_ls <- c()
for (user in mri_no_change_ls) {
if (nrow(key_df[key_df$user_id == user, ]) > row_amount) {
short_mri_no_change_ls <- c(short_mri_no_change_ls, user)
}
}
mri_no_change_index <- which(aumc_df$user_id %in% short_mri_no_change_ls)
# healthy controls
hc_users_index <- which(aumc_df$diagnosis_ms == 0)
hc_users_ls <- aumc_df$user_id[hc_users_index]
short_hc_users_ls <- c()
for (user in hc_users_ls) {
if (nrow(key_df[key_df$user_id == user, ]) > row_amount) {
short_hc_users_ls <- c(short_hc_users_ls, user)
}
}
hc_users_index <- which(aumc_df$user_id %in% short_hc_users_ls)
list_of_user_index_lists=c(data.frame(mri_change=mri_change_index),
data.frame(mri_no_change=mri_no_change_index),
data.frame(hc_users=hc_users_index))
prepped_dfs_list <- list()
for (users_list_number in 1:length(list_of_user_index_lists)){
user_list_name<-names(list_of_user_index_lists[users_list_number])
users_selection_index<-list_of_user_index_lists[[user_list_name]]
print(user_list_name)
# use the chosen index to get a list of users
user_selection <- aumc_df$user_id[users_selection_index]
print(user_selection)
# create a subset of the aumc_df which includes only the users of interest
missing_df <- data.frame("sample size" = colSums(!is.na(aumc_df[aumc_df$diagnosis_ms == 1, c("user_id", mri_cols, visitdate_cols)])))
# subset relevant users' keystroke data
key_sub_df <- key_df[key_df$user_id %in% user_selection, ]
# order based on user id and timestamp
key_sub_df <- key_sub_df[order(key_sub_df$user_id, key_sub_df$timestamp), ]
## Creating a data frame of the complexity measures per user
## Writing code which will parallelize the EWS calculations
dfs_list <- df_user_list(df = key_sub_df, features = c("user_id", "timestamp", feature_selection),
user_column_name = "user_id", users = user_selection)
complexity_dfs_list <- mclapply(dfs_list, EWS_calc) # mclapply is the parallelized version of lapply
# filter only the data frames in the list (drop where there was not enough data)
complexity_dfs_list <- Filter(function(x) is.data.frame(x)[[1]] > 0, complexity_dfs_list)
complex_df <- rbind.fill(complexity_dfs_list)
# merge the complex df back to the keystroke df
key_complex_df <- merge(key_sub_df[, c("user_id", "timestamp", feature_selection)],
complex_df, by = c("user_id", "timestamp"))
# relabel the 10's to 1's in the complexity_peaks factor to avoid issues later
key_complex_df$complexity_peaks[key_complex_df$complexity_peaks == 10] <- 1
# aumc_outcome_selector <- mri_cols
aumc_outcome_selector <- outcomes_of_interest_ls
# create a melted version of the aumc_df with only the variables of interest
aumc_df_m_1 <- melt(aumc_df[c("user_id", aumc_outcome_selector)], id.vars="user_id",
variable.name = "PRO_name", value.name = "PRO_value")
aumc_df[c(date_cols)] <- apply(aumc_df[c(date_cols)], 2, as.character)
aumc_df_m_2 <- melt(aumc_df[c("user_id", date_cols)], id.vars="user_id",
variable.name = "visitdate_cat", value.name = "date")
aumc_df_m_2$date <- as.Date(aumc_df_m_2$date)
aumc_df_m_1 <- aumc_df_m_1 %>% mutate(time_code = substr(PRO_name,1L,4L))
aumc_df_m_2 <- aumc_df_m_2 %>% mutate(time_code = substr(visitdate_cat,1L,4L))
aumc_df_m <- merge(aumc_df_m_1, aumc_df_m_2, by = c("user_id", "time_code"))
# select the outcome measures and their corresponding questionnaire/ visitdates then reassign
aumc_df_m_vs <- aumc_df_m %>%
filter(str_detect(aumc_df_m$PRO_name, paste(visitdate_outcomes, collapse = "|")) &
str_detect(aumc_df_m$visitdate_cat, paste(visitdate_cols, collapse = "|")))
aumc_df_m_qs <- aumc_df_m %>%
filter(str_detect(aumc_df_m$PRO_name, paste(questionnaire_date_outcomes, collapse = "|")) &
str_detect(aumc_df_m$visitdate_cat, paste(questionnare_date_cols, collapse = "|")))
aumc_df_m <- rbind(aumc_df_m_vs, aumc_df_m_qs)
# merge the melted aumc_df with the key_complex_df
key_complex_df$date <- as.Date(key_complex_df$timestamp)
df.mod <- merge(key_complex_df, aumc_df_m, by = c("user_id", "date"), all.x=TRUE, all.y=FALSE)
prepped_dfs_list[[user_list_name]] <- df.mod
}
[1] "mri_change"
[1] 368 377 380 383 389 393 398 409 410 424 433 438 444
[1] "mri_no_change"
[1] 364 365 367 369 382 386 387 388 390 396 400 406 408 411 413 416 429 430 435 449
[1] "hc_users"
[1] 372 373 376 381 384 391 402 403 404 405 412 415 421 423 425 427 431 432 437 439 448 463
Plot the Dynamic Complexity, Cumulative Complexity Peaks, and the corresponding data from the Gd enhancing lesions and Patient Reported Outcome Measures
user_failed_list <- c()
for (users_list_number in 1:length(list_of_user_index_lists)){
user_list_name<-names(list_of_user_index_lists[users_list_number])
users_selection_index<-list_of_user_index_lists[[user_list_name]]
print(user_list_name)
prepped_df <- prepped_dfs_list[[user_list_name]]
# use the chosen index to get a list of users
user_selection <- aumc_df$user_id[users_selection_index]
for (user in user_selection) {
user_dc_df <- prepped_df[prepped_df$user_id == user, ]
sub_aumc_df_m <- aumc_df_m[aumc_df_m$user_id == user, ]
temp <- aumc_df_m[aumc_df_m$user_id %in% users_selection_index, ]
selected_outcomes_list <- c()
for (i in 1:length(outcomes_list)) {
diff_list <- c()
current_outcome_vector <- na.omit(sub_aumc_df_m[grep(outcomes_list[i], sub_aumc_df_m$PRO_name), "PRO_value"])
for (j in 2: length(current_outcome_vector)-1) {
diff_list <- append(diff_list, current_outcome_vector[j + 1] - current_outcome_vector[j])
}
if (outcomes_list[i] %in% c("NHPT_ND_avg", "NHPT_D_avg", "TWT_avg")){
if (any(abs(diff_list) > diff_list[1] * outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"], na.rm=T)){
selected_outcomes_list <- append(selected_outcomes_list, outcomes_list[i])
}
} else{
if (any(abs(diff_list) >= outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"], na.rm=T)){
selected_outcomes_list <- append(selected_outcomes_list, outcomes_list[i])
}
}
}
clin_matches <- unique(grep(paste(selected_outcomes_list,collapse="|"),
sub_aumc_df_m$PRO_name, value=TRUE))
clin_sub_aumc_df_m <- sub_aumc_df_m
clin_sub_aumc_df_m[clin_sub_aumc_df_m$PRO_name %notin% clin_matches, "PRO_value"] <- NA
clin_sub_aumc_df_m$m_date <- str_split_fixed(clin_sub_aumc_df_m$PRO_name, "_", 2)[,1]
clin_sub_aumc_df_m$m_date <- factor(clin_sub_aumc_df_m$m_date)
clin_sub_aumc_df_m$PRO_group <- str_split_fixed(clin_sub_aumc_df_m$PRO_name, "_", 2)[,2]
# Two scaling options - 1) scale within each outcome measure
# sort data frame on PRO_group and PRO_name otherwise merge of scaled column will be incorrect
clin_sub_aumc_df_m <- clin_sub_aumc_df_m[order(clin_sub_aumc_df_m$PRO_group, clin_sub_aumc_df_m$PRO_name), ]
tst <- split(clin_sub_aumc_df_m$PRO_value, clin_sub_aumc_df_m$PRO_group)
tst_ls <- lapply(tst, elascer)
clin_sub_aumc_df_m<-cbind(clin_sub_aumc_df_m, data.frame(PRO_value_scaled=unlist(tst_ls)))
# Two scaling options - 2) scale all outcome measures without grouping prior
# clin_sub_aumc_df_m$PRO_value_scaled <- elascer(clin_sub_aumc_df_m$PRO_value)
user_dc_df$CCP_dates <- user_dc_df$date
if (all(is.na(user_dc_df$complexity_peaks))) {
print(paste("For user ==", user, "there were ZERO complexity peaks"))
user_failed_list <- c(user_failed_list, user)
}else{
user_dc_df[user_dc_df$complexity_peaks < 0.5, "CCP_dates"] <- NA
}
# make sure the outcomes_list and the user outcomes data frame are sorted the same way otherwise stuff will not match correctly
clin_sub_aumc_df_m <- clin_sub_aumc_df_m[order(clin_sub_aumc_df_m$PRO_group, clin_sub_aumc_df_m$PRO_name), ]
outcomes_list <- sort(outcomes_list)
clinically_relevant_bool_col <- c()
for (i in 1:length(outcomes_list)) {
diff_list <- c()
current_outcome_vector <- clin_sub_aumc_df_m[grep(outcomes_list[i], clin_sub_aumc_df_m$PRO_name), "PRO_value"]
for (j in 2: length(current_outcome_vector)-1) {
diff_list <- append(diff_list, current_outcome_vector[j + 1] - current_outcome_vector[j])
}
if (outcomes_list[i] %in% c("NHPT_ND_avg", "NHPT_D_avg", "TWT_avg")){
clinically_relevant_bool <- abs(diff_list) > diff_list[1] * outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"]
clinically_relevant_bool <- c(FALSE, clinically_relevant_bool)
for (j in 2: length(clinically_relevant_bool)) {
if (clinically_relevant_bool[j] & !is.na(clinically_relevant_bool[j])) {
clinically_relevant_bool[j - 1] <- TRUE
}
}
clinically_relevant_bool_col <- append(clinically_relevant_bool_col, clinically_relevant_bool)
} else{
clinically_relevant_bool <- abs(diff_list) >= outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"]
clinically_relevant_bool <- c(FALSE, clinically_relevant_bool)
for (j in 2: length(clinically_relevant_bool)) {
if (clinically_relevant_bool[j] & !is.na(clinically_relevant_bool[j])) {
clinically_relevant_bool[j - 1] <- TRUE
}
}
clinically_relevant_bool_col <- append(clinically_relevant_bool_col, clinically_relevant_bool)
}
}
clin_sub_aumc_df_m$clinically_relevant_boolean <- clinically_relevant_bool_col
# tiff(paste0("/home/james/Data_Science/data-science-project/research-papers/2020-NLTSA/latex/Images/", user, "_main_results.tiff"), units="in", width=10, height=4, res=400)
# png(paste0("/home/james/Data_Science/data-science-project/research-papers/2020-NLTSA/latex/Images/", user, "_main_results.png"), units="in", width=10, height=4, res=400)
date_selection <- unique(clin_sub_aumc_df_m[as.character(clin_sub_aumc_df_m$visitdate_cat) %in% questionnare_date_cols, "date"])
plot <- ggplot(clin_sub_aumc_df_m) +
geom_point(data=clin_sub_aumc_df_m[clin_sub_aumc_df_m$clinically_relevant_boolean, ],
aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group)) +
geom_line(data=clin_sub_aumc_df_m[clin_sub_aumc_df_m$clinically_relevant_boolean, ],
aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group#, size = PRO_group
)) +
geom_point(data=na.omit(clin_sub_aumc_df_m[!clin_sub_aumc_df_m$clinically_relevant_boolean, ]),
aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group), alpha=0.1) +
geom_line(data=na.omit(clin_sub_aumc_df_m[!clin_sub_aumc_df_m$clinically_relevant_boolean, ]),
aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group), alpha=0.1) +
geom_line(data = user_dc_df, aes(date, dynamic_complexity_sum), size=0.5) +
geom_point(data = user_dc_df, aes(date, dynamic_complexity_sum), shape = 16, size=2) +
xlim(min(aumc_df_m[aumc_df_m$user_id == user, "date"], na.rm = TRUE),
max(aumc_df_m[aumc_df_m$user_id == user, "date"], na.rm = TRUE)) +
ylim(0, max(elascer(user_dc_df[user_dc_df$user_id == user, "PRO_value"]), na.rm = TRUE)) +
geom_vline(xintercept = user_dc_df[user_dc_df$complexity_peaks > 0, "date"], col="red", lty=2) +
scale_x_date(breaks = date_selection,
labels = c("m00", "m002", "m03", "m06", "m09", "m12")[1:length(date_selection)]) +
labs(x = "Date", y = "Measure Change (Scaled)") +
scale_color_discrete(name="Outcome measure") +
scale_color_manual("Outcome Measures",
breaks = names(custom_cmap),
values = custom_cmap,
) +
# scale_size_manual(values = c(rep(0.3, 4), 1.5, rep(0.3, 7))) +
theme_minimal() +
# theme(axis.text.x = element_text(angle = 0), legend.title = element_blank())
theme(axis.text.x = element_text(angle = 60, hjust = 0.9, size=10),
axis.text.y = element_text(size=10), legend.title = element_blank(),
plot.title = element_text(hjust = 0.5)) +
ggtitle(paste("Change in time series data for user ==", user))
print(plot)
ggsave(paste0("/home/james/Data_Science/data-science-project/research-papers/2020-NLTSA/latex/Images/", user, "_main_results.eps"))
# dev.off()
}
}
[1] "mri_change"
[1] "mri_no_change"
[1] "hc_users"
[1] "For user == 381 there were ZERO complexity peaks"
[1] "For user == 391 there were ZERO complexity peaks"
[1] "For user == 403 there were ZERO complexity peaks"
[1] "For user == 404 there were ZERO complexity peaks"
[1] "For user == 405 there were ZERO complexity peaks"
[1] "For user == 423 there were ZERO complexity peaks"
[1] "For user == 439 there were ZERO complexity peaks"
[1] "For user == 448 there were ZERO complexity peaks"
Show the missingness within each user and per group
for (users_list_number in 1:length(list_of_user_index_lists)) {
user_list_name<-names(list_of_user_index_lists[users_list_number])
users_selection_index<-list_of_user_index_lists[[user_list_name]]
print(user_list_name)
# use the chosen index to get a list of users
user_selection <- aumc_df$user_id[users_selection_index]
## Show missingness percentages
all_missing_per_user <- data.frame()
for (user in user_selection) {
current_user_missing <- data.frame(user_id = user, t(colMeans(is.na(key_df[key_df$user_id == user, feature_selection]))*100))
all_missing_per_user <- rbind(all_missing_per_user, current_user_missing)
}
print("All percentage missingness per user per feature")
print(all_missing_per_user)
print("Median percentage missingness per user")
median_percent_miss <- data.frame(user_id = user_selection, median_percentage_missing = rowMedians(as.matrix(all_missing_per_user[, feature_selection])))
print(median_percent_miss)
print("Median of the Median percentage missingness per user")
print(median(median_percent_miss$median_percentage_missing, na.rm=TRUE))
cat("\n")
}
[1] "mri_change"
[1] "All percentage missingness per user per feature"
[1] "Median percentage missingness per user"
[1] "Median of the Median percentage missingness per user"
[1] 0.536193
[1] "mri_no_change"
[1] "All percentage missingness per user per feature"
[1] "Median percentage missingness per user"
[1] "Median of the Median percentage missingness per user"
[1] 2.036246
[1] "hc_users"
[1] "All percentage missingness per user per feature"
[1] "Median percentage missingness per user"
[1] "Median of the Median percentage missingness per user"
[1] 1.011029
Plot the CRD and CCP of an example user
---
title: "Supplemental material: Early-Warning Signals for Disease Activity in Patients Diagnosed with Multiple Sclerosis based on Keystroke Dynamics"
subtitle: "The following notebook is the construction of complexity measures and the final results relative to __subsection III.C__ and __section IV__. "
author: 
- "James Twose"
- "james@neurocast.nl"
output:
  html_notebook:
    code_folding: hide
---

#### Read in necessary libraries, functions and look up tables

```{r echo=TRUE, warning=FALSE, message=FALSE}
library(igraph)
library(qgraph)
library(casnet)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(psych)
library(lme4)
library(reshape2)
library(lubridate)
library(parallel)
library(wesanderson)
library(stringr)
library(matrixStats)
library(ggthemes)
library(gridExtra)
library(cowplot)

# save the original plotting space parameters
og_par <- par()
```

##### Show the session information
```{r}
devtools::session_info()
```


```{r echo=FALSE, warning=FALSE, message=FALSE}
# read in helpful functions and lists
source("~/Data_Science/data-science-project/research-papers/2020-NLTSA/code/aumc_extras.R")
high_cor_features <- read.csv("~/Data_Science/data-science-project/research-papers/2020-AUmc-longitudinal-analysis/code/useful_scripts/selected_features_with_outcome_df.csv",
                              row.names=1)
```

#### Read in data

```{r, echo=FALSE, warning=FALSE, message=FALSE}

#READ IN DATA====
AWS_KEY<-set_env_AWS("~/Data_Science/data-science-project/")
# Keystroke data
AWS_data<-get_data_AWS_neuro_vm_shared(your_AWS_folder="James/", 
                                       gen_data_set="Vumc_users_363_500_day_20180821_20200313_NEW_AGGs.csv")
key_df <- AWS_data$AWS_data

# ID data
AWS_data<-get_data_AWS_neuro_vm_shared(your_AWS_folder="VUmc/", 
                                       gen_data_set="vumc_code_neurokeys_code_user_id.csv")
id_df <- AWS_data$AWS_data

# AUmc data
AWS_data<-get_data_AWS_neuro_vumc_ka_hoo_private(your_AWS_folder="", 
                                       gen_data_set="20200513_APPSMS_export_DS_Friendly.csv", 
                                       sep=";", row.names=NULL)
aumc_df <- AWS_data$AWS_data

# merge user_id with the aumc_df
aumc_df <- merge(id_df[, -c(1, 2)], aumc_df, by.x="VUmc.code", by.y="VUmc_code")[, -c(1)]

# rename and order aumc_df on user_id
names(aumc_df)[names(aumc_df) == 'Neurokeys.ID'] <- 'user_id'
aumc_df <- aumc_df[order(aumc_df$user_id), ]

# merge sets the merge column to be row names, reset this following reorder
rownames(aumc_df) <- 1:nrow(aumc_df)

# rename stray visit date column
names(aumc_df)[names(aumc_df) == 'm00_visit_date'] <- 'm00_visitdate'

# replace default values with NA
aumc_df[aumc_df == "1-1-2999"] <- NA
aumc_df[aumc_df < -94] <- NA

# convert the timestamp and date columns to a human readable format
# key_df$timestamp <- as.POSIXct((key_df$timestamp + key_df$timeZoneOffset)/1000, format = "%Y-%m-%d", tz="utc", origin = "1970-01-01")
key_df$timestamp <- as.POSIXct(key_df$timestamp/1000 + key_df$timeZoneOffset, format = "%Y-%m-%d", tz="utc", origin = "1970-01-01")
# key_df$timestamp <- key_df$timestamp + key_df$timeZoneOffset
date_cols <- colnames(aumc_df %>% select(matches("date")))
visitdate_cols <- colnames(aumc_df %>% select(matches("visitdate")))
questionnare_date_cols <- colnames(aumc_df %>% select(matches("Questionnaires_comp")))

aumc_df[date_cols] <- apply(aumc_df[date_cols], 2, as.POSIXlt, format = "%d-%m-%Y", tz="utc", origin = "1970-01-01")


```

#### Prepare the Outcome measures

```{r, echo=TRUE}
for (timepoint in c('m00', 'm03', 'm06', 'm09', 'm12')) {
  aumc_df[, paste0(timepoint, "_NHPT_D_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_NHPT_D1"), paste0(timepoint,"_NHPT_D2"))], na.rm=T)
  aumc_df[, paste0(timepoint, "_NHPT_ND_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_NHPT_ND1"), paste0(timepoint,"_NHPT_ND2"))], na.rm=T)
  aumc_df[, paste0(timepoint, "_NHPT_BH_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_NHPT_D_avg"), paste0(timepoint,"_NHPT_ND_avg"))], na.rm=T)
  aumc_df[, paste0(timepoint, "_TWT_avg")] <- rowMeans(aumc_df[, c(paste0(timepoint, "_TWT_1"), paste0(timepoint,"_TWT_2"))], na.rm=T)
}

```

#### Get a list of all the outcome measures of interest at each time point

```{r}
outcomes_of_interest_ls <- c()
for (outcome in outcomes_list) {
  outcomes_of_interest_ls <- append(outcomes_of_interest_ls, colnames(aumc_df %>% select(matches(outcome))))
}

```

#### Features used in the creation of the complexity measures (taken RQA python notebook)

```{r}
# features with highest rqa measures (most interesting NLTSA wise)
feature_selection <- c('post_correction_slowing_MEAN_2CD',
 'pre_correction_slowing_MEAN_2CD',
 'hold_time_STD',
 'flight_time_MEDIAN',
 'flight_time_MEAN_ABS_CHANGE',
 'hold_time_SKEW',
 'correction_duration_MEAN_2CD',
 'post_correction_slowing_P_AUTO_CORR',
 'after_punctuation_pause_MEAN_CHANGE',
 'after_punctuation_pause_MEAN_2CD',
 'session_duration_MEAN_2CD',
 'after_punctuation_pause_MEDIAN',
 'flight_time_MAX')

feature_selection

```

#### Identify the different groups (change in MRI, no change in MRI and HC users) and prep the data for the analysis

```{r, warning=FALSE, message=FALSE}
mri_cols <- colnames(aumc_df %>% select(matches("mri_enhancing")))

diff_df <- NA
for (i in seq(1, length(mri_cols)-1)) {
  diff_df <- cbind(diff_df, aumc_df[mri_cols[i+1]] - aumc_df[mri_cols[i]])
}
diff_df <- diff_df[, 2:ncol(diff_df)]
diff_df <- diff_df[rowSums(is.na(diff_df)) < 3, ]

## SPECIFY THE INDICES YOU WISH TO INVESTIGATE
row_amount <- 58
# user indices which have a change in mri
mri_change_index <- as.integer(names(which(rowSums(abs(diff_df), na.rm=TRUE) > 0)))
mri_change_ls <- aumc_df$user_id[mri_change_index]
short_mri_change_ls <- c()
for (user in mri_change_ls) {
  if (nrow(key_df[key_df$user_id == user, ]) > row_amount) {
    short_mri_change_ls <- c(short_mri_change_ls, user)
  }
}
mri_change_index <- which(aumc_df$user_id %in% short_mri_change_ls)
# user indices which don't have a change in mri
mri_no_change_index <- as.integer(names(which(rowSums(abs(diff_df), na.rm=TRUE) == 0)))
mri_no_change_ls <- aumc_df$user_id[mri_no_change_index]
short_mri_no_change_ls <- c()
for (user in mri_no_change_ls) {
  if (nrow(key_df[key_df$user_id == user, ]) > row_amount) {
    short_mri_no_change_ls <- c(short_mri_no_change_ls, user)
  }
}
mri_no_change_index <- which(aumc_df$user_id %in% short_mri_no_change_ls)
# healthy controls
hc_users_index <- which(aumc_df$diagnosis_ms == 0)
hc_users_ls <- aumc_df$user_id[hc_users_index]
short_hc_users_ls <- c()
for (user in hc_users_ls) {
  if (nrow(key_df[key_df$user_id == user, ]) > row_amount) {
    short_hc_users_ls <- c(short_hc_users_ls, user)
  }
}
hc_users_index <- which(aumc_df$user_id %in% short_hc_users_ls)

list_of_user_index_lists=c(data.frame(mri_change=mri_change_index),
           data.frame(mri_no_change=mri_no_change_index),
           data.frame(hc_users=hc_users_index))

prepped_dfs_list <- list()
for (users_list_number in 1:length(list_of_user_index_lists)){
  user_list_name<-names(list_of_user_index_lists[users_list_number])
  users_selection_index<-list_of_user_index_lists[[user_list_name]]
  
  print(user_list_name)
  
  # use the chosen index to get a list of users
  user_selection <- aumc_df$user_id[users_selection_index]
  
  print(user_selection)
  
  # create a subset of the aumc_df which includes only the users of interest
  missing_df <- data.frame("sample size" = colSums(!is.na(aumc_df[aumc_df$diagnosis_ms == 1, c("user_id", mri_cols, visitdate_cols)])))
  
  # subset relevant users' keystroke data
  key_sub_df <- key_df[key_df$user_id %in% user_selection, ]
  # order based on user id and timestamp
  key_sub_df <- key_sub_df[order(key_sub_df$user_id, key_sub_df$timestamp), ]
  
  
  ## Creating a data frame of the complexity measures per user

  ## Writing code which will parallelize the EWS calculations
  dfs_list <- df_user_list(df = key_sub_df, features = c("user_id", "timestamp", feature_selection), 
                           user_column_name = "user_id", users = user_selection)
  complexity_dfs_list <- mclapply(dfs_list, EWS_calc) # mclapply is the parallelized version of lapply
  
  # filter only the data frames in the list (drop where there was not enough data)
  complexity_dfs_list <- Filter(function(x) is.data.frame(x)[[1]] > 0, complexity_dfs_list)
  
  complex_df <- rbind.fill(complexity_dfs_list)

  # merge the complex df back to the keystroke df
  key_complex_df <- merge(key_sub_df[, c("user_id", "timestamp", feature_selection)], 
                          complex_df, by = c("user_id", "timestamp")) 
  # relabel the 10's to 1's in the complexity_peaks factor to avoid issues later
  key_complex_df$complexity_peaks[key_complex_df$complexity_peaks == 10] <- 1
  
  # aumc_outcome_selector <- mri_cols
  aumc_outcome_selector <- outcomes_of_interest_ls
  # create a melted version of the aumc_df with only the variables of interest
  aumc_df_m_1 <- melt(aumc_df[c("user_id", aumc_outcome_selector)], id.vars="user_id",
                      variable.name = "PRO_name", value.name = "PRO_value")
  aumc_df[c(date_cols)] <- apply(aumc_df[c(date_cols)], 2, as.character)
  aumc_df_m_2 <- melt(aumc_df[c("user_id", date_cols)], id.vars="user_id", 
                      variable.name = "visitdate_cat", value.name = "date")
  aumc_df_m_2$date <- as.Date(aumc_df_m_2$date)
  
  aumc_df_m_1 <- aumc_df_m_1 %>% mutate(time_code = substr(PRO_name,1L,4L))
  aumc_df_m_2 <- aumc_df_m_2 %>% mutate(time_code = substr(visitdate_cat,1L,4L))
  aumc_df_m <- merge(aumc_df_m_1, aumc_df_m_2, by = c("user_id", "time_code"))
  
  # select the outcome measures and their corresponding questionnaire/ visitdates then reassign
  aumc_df_m_vs <-  aumc_df_m %>% 
    filter(str_detect(aumc_df_m$PRO_name, paste(visitdate_outcomes, collapse = "|")) & 
             str_detect(aumc_df_m$visitdate_cat, paste(visitdate_cols, collapse = "|")))
  
  aumc_df_m_qs <-  aumc_df_m %>% 
    filter(str_detect(aumc_df_m$PRO_name, paste(questionnaire_date_outcomes, collapse = "|")) & 
             str_detect(aumc_df_m$visitdate_cat, paste(questionnare_date_cols, collapse = "|")))
  aumc_df_m <- rbind(aumc_df_m_vs, aumc_df_m_qs)
  
  # merge the melted aumc_df with the key_complex_df
  key_complex_df$date <- as.Date(key_complex_df$timestamp)
  df.mod <- merge(key_complex_df, aumc_df_m, by = c("user_id", "date"), all.x=TRUE, all.y=FALSE)
  
  prepped_dfs_list[[user_list_name]] <- df.mod
  
}


```

#### Plot the Dynamic Complexity, Cumulative Complexity Peaks, and the corresponding data from the Gd enhancing lesions and Patient Reported Outcome Measures

```{r fig.width=10, fig.height=4, message=FALSE, warning=FALSE}

user_failed_list <- c()
for (users_list_number in 1:length(list_of_user_index_lists)){
  user_list_name<-names(list_of_user_index_lists[users_list_number])
  users_selection_index<-list_of_user_index_lists[[user_list_name]]
  
  print(user_list_name)
  
  prepped_df <- prepped_dfs_list[[user_list_name]]
    
  # use the chosen index to get a list of users
  user_selection <- aumc_df$user_id[users_selection_index]
  
  for (user in user_selection) {

    user_dc_df <- prepped_df[prepped_df$user_id == user, ]
    sub_aumc_df_m <- aumc_df_m[aumc_df_m$user_id == user, ]
    temp <- aumc_df_m[aumc_df_m$user_id %in% users_selection_index, ]
    
    selected_outcomes_list <- c()
    for (i in 1:length(outcomes_list)) {
      diff_list <- c()
      current_outcome_vector <- na.omit(sub_aumc_df_m[grep(outcomes_list[i], sub_aumc_df_m$PRO_name), "PRO_value"])
      for (j in 2: length(current_outcome_vector)-1) {
        diff_list <- append(diff_list, current_outcome_vector[j + 1] - current_outcome_vector[j])
      }
      if (outcomes_list[i] %in% c("NHPT_ND_avg", "NHPT_D_avg", "TWT_avg")){
        if (any(abs(diff_list) > diff_list[1] * outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"], na.rm=T)){
          selected_outcomes_list <- append(selected_outcomes_list, outcomes_list[i])
        }
    } else{
          if (any(abs(diff_list) >= outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"], na.rm=T)){
          selected_outcomes_list <- append(selected_outcomes_list, outcomes_list[i])
        }
      }
    }
    
    clin_matches <- unique(grep(paste(selected_outcomes_list,collapse="|"),
                            sub_aumc_df_m$PRO_name, value=TRUE))
    
    clin_sub_aumc_df_m <- sub_aumc_df_m
    clin_sub_aumc_df_m[clin_sub_aumc_df_m$PRO_name %notin% clin_matches, "PRO_value"] <- NA
    
    clin_sub_aumc_df_m$m_date <- str_split_fixed(clin_sub_aumc_df_m$PRO_name, "_", 2)[,1]
    clin_sub_aumc_df_m$m_date <- factor(clin_sub_aumc_df_m$m_date)
    clin_sub_aumc_df_m$PRO_group <- str_split_fixed(clin_sub_aumc_df_m$PRO_name, "_", 2)[,2]
    
    # Two scaling options - 1) scale within each outcome measure
    # sort data frame on PRO_group and PRO_name otherwise merge of scaled column will be incorrect
    clin_sub_aumc_df_m <- clin_sub_aumc_df_m[order(clin_sub_aumc_df_m$PRO_group, clin_sub_aumc_df_m$PRO_name), ]
    tst <- split(clin_sub_aumc_df_m$PRO_value, clin_sub_aumc_df_m$PRO_group)
    tst_ls <- lapply(tst, elascer)
    clin_sub_aumc_df_m<-cbind(clin_sub_aumc_df_m,  data.frame(PRO_value_scaled=unlist(tst_ls)))
    
    # Two scaling options - 2) scale all outcome measures without grouping prior
    # clin_sub_aumc_df_m$PRO_value_scaled <- elascer(clin_sub_aumc_df_m$PRO_value)
    
    user_dc_df$CCP_dates <- user_dc_df$date
    if (all(is.na(user_dc_df$complexity_peaks))) {
      print(paste("For user ==", user, "there were ZERO complexity peaks"))
      user_failed_list <- c(user_failed_list, user)
    }else{
      user_dc_df[user_dc_df$complexity_peaks < 0.5, "CCP_dates"] <- NA
      }
      
      # make sure the outcomes_list and the user outcomes data frame are sorted the same way otherwise stuff will not match correctly
      clin_sub_aumc_df_m <- clin_sub_aumc_df_m[order(clin_sub_aumc_df_m$PRO_group, clin_sub_aumc_df_m$PRO_name), ]
      outcomes_list <- sort(outcomes_list)
      clinically_relevant_bool_col <- c()
      for (i in 1:length(outcomes_list)) {
        diff_list <- c()
        current_outcome_vector <- clin_sub_aumc_df_m[grep(outcomes_list[i], clin_sub_aumc_df_m$PRO_name), "PRO_value"]
        for (j in 2: length(current_outcome_vector)-1) {
          diff_list <- append(diff_list, current_outcome_vector[j + 1] - current_outcome_vector[j])
        }
        
        if (outcomes_list[i] %in% c("NHPT_ND_avg", "NHPT_D_avg", "TWT_avg")){
          clinically_relevant_bool <- abs(diff_list) > diff_list[1] * outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"]
          
          clinically_relevant_bool <- c(FALSE, clinically_relevant_bool)
          for (j in 2: length(clinically_relevant_bool)) {
            if (clinically_relevant_bool[j] & !is.na(clinically_relevant_bool[j])) {
              clinically_relevant_bool[j - 1] <- TRUE
            }
          }
          clinically_relevant_bool_col <- append(clinically_relevant_bool_col, clinically_relevant_bool)
          
      } else{
          clinically_relevant_bool <- abs(diff_list) >= outcome_plus_cutoffs_df[outcome_plus_cutoffs_df$outcome == outcomes_list[i], "cutoffs"]
          
          clinically_relevant_bool <- c(FALSE, clinically_relevant_bool)
          for (j in 2: length(clinically_relevant_bool)) {
            if (clinically_relevant_bool[j] & !is.na(clinically_relevant_bool[j])) {
              clinically_relevant_bool[j - 1] <- TRUE
            }
          }
          clinically_relevant_bool_col <- append(clinically_relevant_bool_col, clinically_relevant_bool)
        }
      }
      
      clin_sub_aumc_df_m$clinically_relevant_boolean <- clinically_relevant_bool_col
      
      date_selection <- unique(clin_sub_aumc_df_m[as.character(clin_sub_aumc_df_m$visitdate_cat) %in% questionnare_date_cols, "date"])
      plot <- ggplot(clin_sub_aumc_df_m) +
        geom_point(data=clin_sub_aumc_df_m[clin_sub_aumc_df_m$clinically_relevant_boolean, ],
                   aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group)) +
        geom_line(data=clin_sub_aumc_df_m[clin_sub_aumc_df_m$clinically_relevant_boolean, ],
                   aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group#, size = PRO_group
                       )) +
        geom_point(data=na.omit(clin_sub_aumc_df_m[!clin_sub_aumc_df_m$clinically_relevant_boolean, ]),
                   aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group), alpha=0.1) +
        geom_line(data=na.omit(clin_sub_aumc_df_m[!clin_sub_aumc_df_m$clinically_relevant_boolean, ]),
                   aes(x=date, y=PRO_value_scaled, color=PRO_group, group = PRO_group), alpha=0.1) +
        geom_line(data = user_dc_df, aes(date, dynamic_complexity_sum), size=0.5) +
        geom_point(data = user_dc_df, aes(date, dynamic_complexity_sum), shape = 16, size=2) +
  
        xlim(min(aumc_df_m[aumc_df_m$user_id == user, "date"], na.rm = TRUE),
        max(aumc_df_m[aumc_df_m$user_id == user, "date"], na.rm = TRUE)) +
        ylim(0, max(elascer(user_dc_df[user_dc_df$user_id == user, "PRO_value"]), na.rm = TRUE)) +
        geom_vline(xintercept = user_dc_df[user_dc_df$complexity_peaks > 0, "date"], col="red", lty=2) +
        scale_x_date(breaks = date_selection, 
                     labels = c("m00", "m002", "m03", "m06", "m09", "m12")[1:length(date_selection)]) +
  
  
        labs(x = "Date", y = "Measure Change (Scaled)") +
        scale_color_discrete(name="Outcome measure") +
        scale_color_manual("Outcome Measures",
                            breaks = names(custom_cmap),
                            values = custom_cmap,
                            ) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 60, hjust = 0.9, size=10),
              axis.text.y = element_text(size=10), legend.title = element_blank(),
              plot.title = element_text(hjust = 0.5)) +
        ggtitle(paste("Change in time series data for user ==", user))
      
      print(plot)
      
  }
}

```

#### Show the missingness within each user and per group

```{r}
for (users_list_number in 1:length(list_of_user_index_lists)) {
  user_list_name<-names(list_of_user_index_lists[users_list_number])
  users_selection_index<-list_of_user_index_lists[[user_list_name]]
  
  print(user_list_name)
  
  # use the chosen index to get a list of users
  user_selection <- aumc_df$user_id[users_selection_index]

  ## Show missingness percentages
  all_missing_per_user <- data.frame()
  for (user in user_selection) {
    current_user_missing <- data.frame(user_id = user, t(colMeans(is.na(key_df[key_df$user_id == user, feature_selection]))*100))
    all_missing_per_user <- rbind(all_missing_per_user, current_user_missing)
  }
  
  print("All percentage missingness per user per feature")
  print(all_missing_per_user)
  
  print("Median percentage missingness per user")
  median_percent_miss <- data.frame(user_id = user_selection, median_percentage_missing = rowMedians(as.matrix(all_missing_per_user[, feature_selection])))
  print(median_percent_miss)
  
  print("Median of the Median percentage missingness per user")
  print(median(median_percent_miss$median_percentage_missing, na.rm=TRUE))
  cat("\n")

}
```

#### Plot the CRD and CCP of an example user

```{r, warning=FALSE, message=FALSE}
# subset users' keystroke data
key_sub_df <- key_df[key_df$user_id == 390, ]
# order based on user id and timestamp
key_sub_df <- key_sub_df[order(key_sub_df$user_id, key_sub_df$timestamp), ]
df <- df_user_list(df = key_sub_df, features = c("user_id", "timestamp", feature_selection), 
                           user_column_name = "user_id", users = 390)$ID_390

user <- df$user_id[1]
df <- subset(df, select = -c(user_id))
cat("Calculating Early Warning Signals and Cumulative Complexity Peaks for user == ", user, "\n")
# remove columns with too many NAs
df <- df[, colSums(is.na(df)) < nrow(df)/3]
vars <- colnames(df)[2:length(colnames(df))]
# Impute missing values with Classification And Regression Trees / Random Forests
# RF and CART return (identical) discrete numbers
imp.cart  <- mice::mice(df[vars], method = 'cart', remove.constant = TRUE, remove.collinear = TRUE, printFlag = FALSE)
df_imp  <- mice::complete(imp.cart)

# put each column between 0 and 1 using elastic scaler
elasc_df <- data.frame(apply(df_imp, 2, elascer))
# dynamic complexity of the variables with the imputed data
win = 28
dc <- dc_win(elasc_df, win = win, scale_min=0, scale_max=1, doPlot = FALSE, colOrder = NA)
datesIMP <- df$timestamp
ccp.caseIMP <- dc_ccp(df_win = dc, alpha_item = 0.001, alpha_time = 0.001)

plot_pre = TRUE

if (plot_pre == TRUE){
  # Plot the Complexity Resonance Diagram Plot
  dc <- dplyr::select(dc, names(sort(colMeans(dc, na.rm = TRUE))))
  plotDC_res_jms(df_win = dc, win = win, colOrder = TRUE, 
             useTimeVector = NA, #datesIMP, 
             timeStamp = "31-01-99",
             title = paste("Complexity Resonance Diagram user == ", user))
  # Plot the Cumulative Complexity Peak Plot
  plotDC_ccp_jms(df_ccp = ccp.caseIMP[, c(colnames(dc), "sig.peaks")], win = win, colOrder = TRUE, 
             useTimeVector = NA, #datesIMP, 
             timeStamp = "31-01-99",
             title = paste("Cumulative Complexity Peak Plot user == ", user))
}

```
