Supplemental material: Early-Warning Signals for Disease Activity in Patients Diagnosed with Multiple Sclerosis based on Keystroke Dynamics

Author: James Twose | Data Scientist
Email: james@neurocast.nl
External partner: Amsterdam UMC
Date: October 2020

The following script is the feature selection via RQA relative to subsection 3.B.

For literature on the NLTSA methods used in this notebook see the following reference:
Marwan, N., Romano, M. C., Thiel, M., & Kurths, J. (2007). Recurrence plots for the analysis of complex systems. Physics reports, 438(5-6), 237-329.

Merge the keystroke data with the clinical outcome data

In [20]:
df = pd.merge(aumc_df, key_df, on = "user_id", how = "inner")

Coerce the data type of all the questionnaire completion date columns to a datetime

In [21]:
df[df.filter(regex="Questionnaires_completiondate").columns]=df.filter(regex="Questionnaires_completiondate").apply(lambda x : pd.to_datetime(x, format="%d-%m-%Y").dt.date)
df[df.filter(regex="visitdate").columns]=df.filter(regex="visitdate").apply(lambda x : pd.to_datetime(x, format="%d-%m-%Y").dt.date)
In [22]:
key_check_df = pd.merge(key_df, 
         aumc_df.loc[:, aumc_df.filter(regex="completiondate").columns.tolist() + aumc_df.filter(regex="visitdate").columns.tolist() + ["user_id"]],
         on = "user_id")

Getting RQA summaries for each person

In [23]:
_select_variables=set(new_keystroke_agg_list) ^ set(variables_with_high_correlations) & set(new_keystroke_agg_list)
select_variables=list(_select_variables & set(key_check_df.drop("late_smartphone_use", axis=1).columns.tolist()))
all_users_rqa_measures_df = pd.DataFrame()
for user in key_check_df["user_id"].unique().tolist():
    temp = key_check_df.loc[key_check_df["user_id"]==user, :]
    if temp.shape[0] > 3:
        with silence_stdout():
            determinism_column = temp.loc[:, select_variables].apply(lambda x: RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2,
                                                                                            recurrence_rate=0.05).determinism(l_min=2))
        determinism_column.name = "DET"
    if temp.shape[0] > 3:
        with silence_stdout():
            laminarity_column = temp.loc[:, select_variables].apply(lambda x: RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2,
                                                                                            recurrence_rate=0.05).laminarity(v_min=2))
        laminarity_column.name = "LAM"
    if temp.shape[0] > 3:
        with silence_stdout():
            diag_entropy_column = temp.loc[:, select_variables].apply(lambda x: RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2,
                                                                                            recurrence_rate=0.05).diag_entropy(l_min=2))
        diag_entropy_column.name = "ENT"
    rp_df = pd.concat([pd.DataFrame([np.repeat(user, repeats=temp.shape[1])], index=["user_id"], columns=temp.columns.tolist()).T,
                                     determinism_column, laminarity_column, diag_entropy_column
                      ], axis=1)
    all_users_rqa_measures_df = pd.concat([all_users_rqa_measures_df, rp_df])
In [24]:
all_users_rqa_measures_df.loc[select_variables, :]
Out[24]:
user_id DET LAM ENT
session_duration_STRIKE_BLW_MEAN 363 0.000000 0.207792 -0.000000e+00
session_duration_STRIKE_BLW_MEAN 364 0.071315 0.171682 9.608452e-01
session_duration_STRIKE_BLW_MEAN 365 0.089779 0.130791 9.928892e-01
session_duration_STRIKE_BLW_MEAN 366 0.046644 0.129829 9.711340e-01
session_duration_STRIKE_BLW_MEAN 367 0.092639 0.254776 6.538157e-01
... ... ... ... ...
absolute_amount_kind_keystrokes_numberKeystroke 484 0.094937 0.173594 4.101163e-01
absolute_amount_kind_keystrokes_numberKeystroke 485 0.054054 0.089109 1.666667e-09
absolute_amount_kind_keystrokes_numberKeystroke 486 0.083333 0.056250 6.931472e-01
absolute_amount_kind_keystrokes_numberKeystroke 487 0.133929 0.046053 4.101163e-01
absolute_amount_kind_keystrokes_numberKeystroke 488 0.111111 0.280000 5.000000e-09

17388 rows × 4 columns

In [25]:
cmap_name = "neurocast_colours"
colors = [ 
    "#FFFFFF",
    "#39509e"
          ]
n_bin = 10
cm = LinearSegmentedColormap.from_list(cmap_name, colors, N=n_bin)
sns.set_style("whitegrid")
In [26]:
user=390
temp = key_check_df.loc[key_check_df["user_id"]==user, :]
example_feature = "flight_time_MEDIAN"
x = temp.loc[:, example_feature]
example_rm = RecurrencePlot(time_series=x.values, metric="manhattan", dim=3, tau=2, recurrence_rate=0.05).recurrence_matrix()

# rotate the matrix 90 degrees so it starts at day 0 in the bottom left corner
new_matrix = [[example_rm[j][i] for j in range(len(example_rm))] for i in range(len(example_rm[0])-1,-1,-1)]

fig, axes = plt.subplots(2, 1, figsize=(7, 9), gridspec_kw={'height_ratios': [1, 3]}) 
sns.despine(left=True)

axe = sns.lineplot(x=np.arange(0, temp.loc[:, example_feature].shape[0]), 
                 y=temp.loc[:, example_feature], 
                color = "#39509e",
                ax=axes[0])

axe.set_ylabel(r'$\tilde{u}\left(FT_{n}\right) \, \, \left[\mathrm{ms} \right]$')

yticks = list(np.linspace(0, len(x)-3, 20, dtype=np.int))
_xticklabels = list(np.arange(0, len(x), 1))
xticklabels = [_xticklabels[idx] for idx in yticks]
_yticklabels = list(np.arange(len(x)-3, -1, -1))
yticklabels = [_yticklabels[idx] for idx in yticks]

ax = sns.heatmap(new_matrix, cmap=cm, ax=axes[1], cbar=False, xticklabels = xticklabels, yticklabels = yticklabels
                )
ax.set_xticks(yticks)
ax.set_yticks(yticks)

_ = plt.tight_layout()
Calculating recurrence plot at fixed recurrence rate...
Calculating the manhattan distance matrix...

Show the most interesting features according to the RQA measures

In [28]:
det_summary_df = all_users_rqa_measures_df.drop(["user_id"], axis=1).dropna(axis=0).groupby(
    all_users_rqa_measures_df.dropna(axis=0).index).describe()["DET"].sort_values(by = "50%", ascending=False).head(10)
lam_summary_df = all_users_rqa_measures_df.drop(["user_id"], axis=1).dropna(axis=0).groupby(
    all_users_rqa_measures_df.dropna(axis=0).index).describe()["LAM"].sort_values(by = "50%", ascending=False).head(10)
ent_summary_df = all_users_rqa_measures_df.drop(["user_id"], axis=1).dropna(axis=0).groupby(
    all_users_rqa_measures_df.dropna(axis=0).index).describe()["ENT"].sort_values(by = "50%", ascending=False).head(10)
In [29]:
_ = plt.figure(figsize=(5,3))
_ = plt.errorbar(y = det_summary_df.index, x = det_summary_df["50%"], xerr = det_summary_df["std"], linestyle='None', marker='^')
In [30]:
_ = plt.figure(figsize=(5,3))
_ = plt.errorbar(y = lam_summary_df.index, x = lam_summary_df["50%"], xerr = lam_summary_df["std"], linestyle='None', marker='^')
In [31]:
_ = plt.figure(figsize=(5,3))
_ = plt.errorbar(y = ent_summary_df.index, x = ent_summary_df["50%"], xerr = ent_summary_df["std"], linestyle='None', marker='^')

Chosen keystroke features

In [33]:
top_rqa_meas = pd.concat([det_summary_df, lam_summary_df, ent_summary_df]).index.unique().tolist()
feature_list = [x for x in top_rqa_meas if not 'MIN' in x]
feature_list
Out[33]:
['post_correction_slowing_MEAN_2CD',
 'pre_correction_slowing_MEAN_2CD',
 'hold_time_MEDIAN',
 'hold_time_STD',
 'flight_time_MEDIAN',
 'flight_time_MEAN_ABS_CHANGE',
 'hold_time_SKEW',
 'correction_duration_MEAN_2CD',
 'post_correction_slowing_P_AUTO_CORR',
 'after_punctuation_pause_MEAN_CHANGE',
 'after_punctuation_pause_MEAN_2CD',
 'session_duration_MEAN_2CD',
 'after_punctuation_pause_MEDIAN',
 'flight_time_MAX']

Remove any keystroke feature that has a high correlation in the percentage change

In [34]:
corr_df = key_check_df.loc[:, feature_list].pct_change().corr()

corr_df[corr_df > 0.2].head()
Out[34]:
post_correction_slowing_MEAN_2CD pre_correction_slowing_MEAN_2CD hold_time_MEDIAN hold_time_STD flight_time_MEDIAN flight_time_MEAN_ABS_CHANGE hold_time_SKEW correction_duration_MEAN_2CD post_correction_slowing_P_AUTO_CORR after_punctuation_pause_MEAN_CHANGE after_punctuation_pause_MEAN_2CD session_duration_MEAN_2CD after_punctuation_pause_MEDIAN flight_time_MAX
post_correction_slowing_MEAN_2CD 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
pre_correction_slowing_MEAN_2CD NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
hold_time_MEDIAN NaN NaN 1.000000 0.869314 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
hold_time_STD NaN NaN 0.869314 1.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
flight_time_MEDIAN NaN NaN NaN NaN 1.0 0.261589 NaN NaN NaN NaN NaN NaN NaN NaN

Here hold_time_MEDIAN has been chosen to be removed

In [35]:
feature_list = [x for x in top_rqa_meas if not 'hold_time_MEDIAN' in x]
feature_list
Out[35]:
['post_correction_slowing_MEAN_2CD',
 'pre_correction_slowing_MEAN_2CD',
 'hold_time_STD',
 'flight_time_MEDIAN',
 'flight_time_MEAN_ABS_CHANGE',
 'hold_time_SKEW',
 'correction_duration_MEAN_2CD',
 'post_correction_slowing_P_AUTO_CORR',
 'after_punctuation_pause_MEAN_CHANGE',
 'after_punctuation_pause_MEAN_2CD',
 'session_duration_MEAN_2CD',
 'after_punctuation_pause_MEDIAN',
 'press_press_latency_MIN',
 'flight_time_MAX',
 'correction_duration_MIN']

The following are descriptive plots for the chosen features on a user level basis

In [37]:
user = 389
des_df = df.loc[df["user_id"]==user, feature_list]
_ = des_df.reset_index(drop=True, inplace=True)

fig, axs = plt.subplots(nrows=des_df.shape[1], ncols=2, figsize=(15,25), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 1.0, wspace=.1)
fig.suptitle('User with MS and changes in MRI', y=0.895)
axs = axs.ravel()

for i in range(0, des_df.shape[1]*2, 2):
    axs[i].plot(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], color="#39509e")
    axs[i].set(title="", xlabel="ms")
    axs[i+1].hist(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], bins=50, color="#39509e")
    axs[i+1].set(title="", xlabel=des_df.columns.tolist()[int(i/2)])
In [38]:
user = 390
des_df = df.loc[df["user_id"]==user, feature_list]
_ = des_df.reset_index(drop=True, inplace=True)

fig, axs = plt.subplots(nrows=des_df.shape[1], ncols=2, figsize=(15,25), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 1.0, wspace=.1)
fig.suptitle('User with MS and no changes in MRI', y=0.895)
axs = axs.ravel()

for i in range(0, des_df.shape[1]*2, 2):
    axs[i].plot(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], color="#2E3135")
    axs[i].set(title="", xlabel="ms")
    axs[i+1].hist(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], bins=50, color="#2E3135")
    axs[i+1].set(title="", xlabel=des_df.columns.tolist()[int(i/2)])
In [39]:
user = 384 #373 391
des_df = df.loc[df["user_id"]==user, feature_list]
_ = des_df.reset_index(drop=True, inplace=True)

fig, axs = plt.subplots(nrows=des_df.shape[1], ncols=2, figsize=(15,25), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 1.0, wspace=.1)
fig.suptitle('Healthy Control User', y=0.895)
axs = axs.ravel()

for i in range(0, des_df.shape[1]*2, 2):
    axs[i].plot(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], color="#F0871B")
    axs[i].set(title="", xlabel="ms")
    axs[i+1].hist(des_df.loc[:, des_df.columns.tolist()[int(i/2)]], bins=50, color="#F0871B")
    axs[i+1].set(title="", xlabel=des_df.columns.tolist()[int(i/2)])
In [40]:
impute_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)
estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator)
    )

dfs_dict = dict()
for user in [389, 390, 384]:
    user_imp_df = pd.DataFrame(estimator.fit_transform(key_check_df.loc[key_check_df["user_id"]==user, feature_list].values), 
                               index = key_check_df.loc[key_check_df["user_id"]==user, feature_list].index, 
                               columns = feature_list)

    scal_user_df = pd.DataFrame(MinMaxScaler().fit_transform(user_imp_df), index = key_check_df.loc[key_check_df["user_id"]==user, feature_list].index, 
                               columns = feature_list)
    _ = scal_user_df.reset_index(drop=True, inplace=True)
    dfs_dict[user] = scal_user_df
In [44]:
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)

bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95

feature_selection = ["hold_time_STD", 
                     "flight_time_MEDIAN", 
                     "session_duration_MEAN_2CD", 
                     "flight_time_MAX", 
                    ]

for i in range(0, len(dfs_dict)):
    user=[389, 390, 384][i]
    title=['User with MS and changes in MRI',
          'User with MS and no changes in MRI',
          'Healthy Control User'][i]
    
    df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
    g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
                     palette=cmap_name)
    g.set_title(title, fontsize=13)
    g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1], 
                     bins=bin_amount, legend=None, palette=cmap_name, stat="density")
    _ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45), 
               fontsize=13, ncol=4, fancybox=True, shadow=True)
In [46]:
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)

bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95

feature_selection = ["flight_time_MEAN_ABS_CHANGE",
                     "after_punctuation_pause_MEDIAN",
                     "hold_time_SKEW",
                     "pre_correction_slowing_MEAN_2CD"
                    ]

for i in range(0, len(dfs_dict)):
    user=[389, 390, 384][i]
    title=['User with MS and changes in MRI',
          'User with MS and no changes in MRI',
          'Healthy Control User'][i]
    
    df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
    g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
                     palette=cmap_name)
    g.set_title(title, fontsize=13)
    g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1], 
                     bins=bin_amount, legend=None, palette=cmap_name, stat="density")
    _ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45), 
               fontsize=13, ncol=4, fancybox=True, shadow=True)
In [47]:
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)

bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95

feature_selection = ["post_correction_slowing_MEAN_2CD",
                     "correction_duration_MEAN_2CD",
                     "post_correction_slowing_P_AUTO_CORR",
                     "after_punctuation_pause_MEAN_CHANGE"
                    ]

for i in range(0, len(dfs_dict)):
    user=[389, 390, 384][i]
    title=['User with MS and changes in MRI',
          'User with MS and no changes in MRI',
          'Healthy Control User'][i]
    
    df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
    g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
                     palette=cmap_name)
    g.set_title(title, fontsize=13)
    g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1], 
                     bins=bin_amount, legend=None, palette=cmap_name, stat="density")
    _ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45), 
               fontsize=13, ncol=4, fancybox=True, shadow=True)
In [48]:
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(16,10), gridspec_kw={'width_ratios': [4, 1]})
fig.subplots_adjust(hspace = 0.3, wspace=.1)

bin_amount=50
cmap_name="Dark2"
cmap=plt.get_cmap(cmap_name)
title_dist = 0.95

feature_selection = ["after_punctuation_pause_MEAN_2CD",
                     "press_press_latency_MIN",
                     "correction_duration_MIN"                    ]

for i in range(0, len(dfs_dict)):
    user=[389, 390, 384][i]
    title=['User with MS and changes in MRI',
          'User with MS and no changes in MRI',
          'Healthy Control User'][i]
    
    df_for_plot = dfs_dict[user].loc[:, feature_selection].reindex(sorted(dfs_dict[user].loc[:, feature_selection].columns), axis=1)
    g = sns.lineplot(data=df_for_plot, ax=axs[i,0], dashes=False, alpha=0.8, legend=None,
                     palette=cmap_name)
    g.set_title(title, fontsize=13)
    g = sns.histplot(data=df_for_plot.melt(), x="value", hue="variable", kde=True, ax=axs[i,1], 
                     bins=bin_amount, legend=None, palette=cmap_name, stat="density")
    _ = axs[i,1].set_ylim(0,6)
_ = plt.legend(sorted(df_for_plot.columns.tolist(), reverse=True), loc="lower center", bbox_to_anchor=(-1.65, -0.45), 
               fontsize=13, ncol=4, fancybox=True, shadow=True)