Author: James Twose | Data Scientist
Email: james@neurocast.nl
External partner: Amsterdam UMC
Date: October 2020
For literature on the NLTSA methods used in this notebook see the following reference:
Marwan, N., Romano, M. C., Thiel, M., & Kurths, J. (2007). Recurrence plots for the analysis of complex systems. Physics reports, 438(5-6), 237-329.
Merge the keystroke data with the clinical outcome data
df = pd.merge(aumc_df, key_df, on = "user_id", how = "inner")
Coerce the data type of all the questionnaire completion date columns to a datetime
df[df.filter(regex="Questionnaires_completiondate").columns]=df.filter(regex="Questionnaires_completiondate").apply(lambda x : pd.to_datetime(x, format="%d-%m-%Y").dt.date)
df[df.filter(regex="visitdate").columns]=df.filter(regex="visitdate").apply(lambda x : pd.to_datetime(x, format="%d-%m-%Y").dt.date)
key_check_df = pd.merge(key_df,
aumc_df.loc[:, aumc_df.filter(regex="completiondate").columns.tolist() + aumc_df.filter(regex="visitdate").columns.tolist() + ["user_id"]],
on = "user_id")
_select_variables=set(new_keystroke_agg_list) ^ set(variables_with_high_correlations) & set(new_keystroke_agg_list)
select_variables=list(_select_variables & set(key_check_df.drop("late_smartphone_use", axis=1).columns.tolist()))
all_users_rqa_measures_df = pd.DataFrame()
for user in key_check_df["user_id"].unique().tolist():
temp = key_check_df.loc[key_check_df["user_id"]==user, :]
if temp.shape[0] > 3:
with silence_stdout():
determinism_column = temp.loc[:, select_variables].apply(lambda x: RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2,
recurrence_rate=0.05).determinism(l_min=2))
determinism_column.name = "DET"
if temp.shape[0] > 3:
with silence_stdout():
laminarity_column = temp.loc[:, select_variables].apply(lambda x: RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2,
recurrence_rate=0.05).laminarity(v_min=2))
laminarity_column.name = "LAM"
if temp.shape[0] > 3:
with silence_stdout():
diag_entropy_column = temp.loc[:, select_variables].apply(lambda x: RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2,
recurrence_rate=0.05).diag_entropy(l_min=2))
diag_entropy_column.name = "ENT"
rp_df = pd.concat([pd.DataFrame([np.repeat(user, repeats=temp.shape[1])], index=["user_id"], columns=temp.columns.tolist()).T,
determinism_column, laminarity_column, diag_entropy_column
], axis=1)
all_users_rqa_measures_df = pd.concat([all_users_rqa_measures_df, rp_df])
all_users_rqa_measures_df.loc[select_variables, :]
cmap_name = "neurocast_colours"
colors = [
"#FFFFFF",
"#39509e"
]
n_bin = 10
cm = LinearSegmentedColormap.from_list(cmap_name, colors, N=n_bin)
sns.set_style("whitegrid")
user=390
temp = key_check_df.loc[key_check_df["user_id"]==user, :]
example_feature = "flight_time_MEDIAN"
x = temp.loc[:, example_feature]
example_rm = RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2, recurrence_rate=0.05).recurrence_matrix()
# rotate the matrix 90 degrees so it starts at day 0 in the bottom left corner
new_matrix = [[example_rm[j][i] for j in range(len(example_rm))] for i in range(len(example_rm[0])-1,-1,-1)]
fig, axes = plt.subplots(2, 1, figsize=(7, 9), gridspec_kw={'height_ratios': [1, 3]})
sns.despine(left=True)
axe = sns.lineplot(x=np.arange(0, temp.loc[:, example_feature].shape[0]),
y=temp.loc[:, example_feature],
color = "#39509e",
ax=axes[0])
axe.set_ylabel(r'$\tilde{u}\left(FT_{n}\right) \, \, \left[\mathrm{ms} \right]$')
yticks = list(np.linspace(0, len(x)-3, 20, dtype=np.int))
_xticklabels = list(np.arange(0, len(x), 1))
xticklabels = [_xticklabels[idx] for idx in yticks]
_yticklabels = list(np.arange(len(x)-3, -1, -1))
yticklabels = [_yticklabels[idx] for idx in yticks]
ax = sns.heatmap(new_matrix, cmap=cm, ax=axes[1], cbar=False, xticklabels = xticklabels, yticklabels = yticklabels
)
ax.set_xticks(yticks)
ax.set_yticks(yticks)
_ = plt.tight_layout()
det_summary_df = all_users_rqa_measures_df.drop(["user_id"], axis=1).dropna(axis=0).groupby(
all_users_rqa_measures_df.dropna(axis=0).index).describe()["DET"].sort_values(by = "50%", ascending=False).head(10)
lam_summary_df = all_users_rqa_measures_df.drop(["user_id"], axis=1).dropna(axis=0).groupby(
all_users_rqa_measures_df.dropna(axis=0).index).describe()["LAM"].sort_values(by = "50%", ascending=False).head(10)
ent_summary_df = all_users_rqa_measures_df.drop(["user_id"], axis=1).dropna(axis=0).groupby(
all_users_rqa_measures_df.dropna(axis=0).index).describe()["ENT"].sort_values(by = "50%", ascending=False).head(10)
_ = plt.figure(figsize=(5,3))
_ = plt.errorbar(y = det_summary_df.index, x = det_summary_df["50%"], xerr = det_summary_df["std"], linestyle='None', marker='^')
_ = plt.figure(figsize=(5,3))
_ = plt.errorbar(y = lam_summary_df.index, x = lam_summary_df["50%"], xerr = lam_summary_df["std"], linestyle='None', marker='^')
_ = plt.figure(figsize=(5,3))
_ = plt.errorbar(y = ent_summary_df.index, x = ent_summary_df["50%"], xerr = ent_summary_df["std"], linestyle='None', marker='^')
top_rqa_meas = pd.concat([det_summary_df, lam_summary_df, ent_summary_df]).index.unique().tolist()
feature_list = [x for x in top_rqa_meas if not 'MIN' in x]
feature_list
corr_df = key_check_df.loc[:, feature_list].pct_change().corr()
corr_df[corr_df > 0.2].head()
feature_list = [x for x in top_rqa_meas if not 'hold_time_MEDIAN' in x]
feature_list
user = 389
des_df = df.loc[df["user_id"]==user, feature_list]
_ = des_df.reset_index(drop=True, inplace=True)
fig, axs = plt.subplots(nrows=des_df.shape[1], ncols=2, figsize=(15,25), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 1.0, wspace=.1)
fig.suptitle('User with MS and changes in MRI', y=0.895)
axs = axs.ravel()
for i in range(0, des_df.shape[1]*2, 2):
axs[i].plot(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], color="#39509e")
axs[i].set(title="", xlabel="ms")
axs[i+1].hist(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], bins=50, color="#39509e")
axs[i+1].set(title="", xlabel=des_df.columns.tolist()[int(i/2)])
user = 390
des_df = df.loc[df["user_id"]==user, feature_list]
_ = des_df.reset_index(drop=True, inplace=True)
fig, axs = plt.subplots(nrows=des_df.shape[1], ncols=2, figsize=(15,25), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 1.0, wspace=.1)
fig.suptitle('User with MS and no changes in MRI', y=0.895)
axs = axs.ravel()
for i in range(0, des_df.shape[1]*2, 2):
axs[i].plot(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], color="#2E3135")
axs[i].set(title="", xlabel="ms")
axs[i+1].hist(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], bins=50, color="#2E3135")
axs[i+1].set(title="", xlabel=des_df.columns.tolist()[int(i/2)])
user = 384 #373 391
des_df = df.loc[df["user_id"]==user, feature_list]
_ = des_df.reset_index(drop=True, inplace=True)
fig, axs = plt.subplots(nrows=des_df.shape[1], ncols=2, figsize=(15,25), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 1.0, wspace=.1)
fig.suptitle('Healthy Control User', y=0.895)
axs = axs.ravel()
for i in range(0, des_df.shape[1]*2, 2):
axs[i].plot(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], color="#F0871B")
axs[i].set(title="", xlabel="ms")
axs[i+1].hist(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], bins=50, color="#F0871B")
axs[i+1].set(title="", xlabel=des_df.columns.tolist()[int(i/2)])
impute_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)
estimator = make_pipeline(
IterativeImputer(random_state=0, estimator=impute_estimator)
)
dfs_dict = dict()
for user in [389, 390, 384]:
user_imp_df = pd.DataFrame(estimator.fit_transform(key_check_df.loc[key_check_df["user_id"]==user, feature_list].values),
index = key_check_df.loc[key_check_df["user_id"]==user, feature_list].index,
columns = feature_list)
scal_user_df = pd.DataFrame(MinMaxScaler().fit_transform(user_imp_df), index = key_check_df.loc[key_check_df["user_id"]==user, feature_list].index,
columns = feature_list)
_ = scal_user_df.reset_index(drop=True, inplace=True)
dfs_dict[user] = scal_user_df
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)
bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95
feature_selection = ["hold_time_STD",
"flight_time_MEDIAN",
"session_duration_MEAN_2CD",
"flight_time_MAX",
]
for i in range(0, len(dfs_dict)):
user=[389, 390, 384][i]
title=['User with MS and changes in MRI',
'User with MS and no changes in MRI',
'Healthy Control User'][i]
df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
palette=cmap_name)
g.set_title(title, fontsize=13)
g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1],
bins=bin_amount, legend=None, palette=cmap_name, stat="density")
_ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45),
fontsize=13, ncol=4, fancybox=True, shadow=True)
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)
bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95
feature_selection = ["flight_time_MEAN_ABS_CHANGE",
"after_punctuation_pause_MEDIAN",
"hold_time_SKEW",
"pre_correction_slowing_MEAN_2CD"
]
for i in range(0, len(dfs_dict)):
user=[389, 390, 384][i]
title=['User with MS and changes in MRI',
'User with MS and no changes in MRI',
'Healthy Control User'][i]
df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
palette=cmap_name)
g.set_title(title, fontsize=13)
g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1],
bins=bin_amount, legend=None, palette=cmap_name, stat="density")
_ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45),
fontsize=13, ncol=4, fancybox=True, shadow=True)
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)
bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95
feature_selection = ["post_correction_slowing_MEAN_2CD",
"correction_duration_MEAN_2CD",
"post_correction_slowing_P_AUTO_CORR",
"after_punctuation_pause_MEAN_CHANGE"
]
for i in range(0, len(dfs_dict)):
user=[389, 390, 384][i]
title=['User with MS and changes in MRI',
'User with MS and no changes in MRI',
'Healthy Control User'][i]
df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
palette=cmap_name)
g.set_title(title, fontsize=13)
g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1],
bins=bin_amount, legend=None, palette=cmap_name, stat="density")
_ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45),
fontsize=13, ncol=4, fancybox=True, shadow=True)
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)
bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95
feature_selection = ["after_punctuation_pause_MEAN_2CD",
"press_press_latency_MIN",
"correction_duration_MIN" ]
for i in range(0, len(dfs_dict)):
user=[389, 390, 384][i]
title=['User with MS and changes in MRI',
'User with MS and no changes in MRI',
'Healthy Control User'][i]
df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
palette=cmap_name)
g.set_title(title, fontsize=13)
g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1],
bins=bin_amount, legend=None, palette=cmap_name, stat="density")
_ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45),
fontsize=13, ncol=4, fancybox=True, shadow=True)