Methodology¶
The expected content of this notebook:¶
- Detection of time-series changes in daily questionnaire data about fatigue
Data: James's daily questionnaire data¶
[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from jmspack.NLTSA import (ts_levels,
                           distribution_uniformity,
                           fluctuation_intensity,
                           complexity_resonance,
                           complexity_resonance_diagram,
                           cumulative_complexity_peaks,
                           cumulative_complexity_peaks_plot)
import miceforest as mf
import session_info
Display the session info of the notebook¶
[2]:
session_info.show()
[2]:
-----
jmspack      0.0.3
matplotlib   3.3.4
miceforest   NA
numpy        1.19.2
pandas       1.2.3
seaborn      0.11.1
session_info 1.0.0
sklearn      0.24.1
-----
Modules imported as dependencies: PIL 8.1.2, appnope 0.1.2, backcall 0.2.0, cffi 1.14.5, colorama 0.4.4, cycler 0.10.0, cython_runtime NA, dateutil 2.8.1, decorator 4.4.2, ipykernel 5.3.4, ipython_genutils 0.2.0, ipywidgets 7.6.3, jedi 0.17.2, joblib 0.17.0, kiwisolver 1.3.1, mpl_toolkits NA, parso 0.7.0, pexpect 4.8.0, pickleshare 0.7.5, pkg_resources NA, prompt_toolkit 3.0.8, ptyprocess 0.7.0, pyexpat NA, pygments 2.8.1, pyparsing 2.4.7, pytz 2021.1, scipy 1.5.3, six 1.15.0, statsmodels 0.12.2, storemagic NA, tornado 6.1, traitlets 5.0.5, wcwidth 0.2.5, zmq 20.0.0
-----
IPython        7.21.0
jupyter_client 6.1.7
jupyter_core   4.7.1
jupyterlab     2.2.6
notebook       6.2.0
-----
Python 3.9.2 (default, Mar 3 2021, 11:58:52) [Clang 10.0.0 ]
macOS-10.16-x86_64-i386-64bit
-----
Session information updated at 2021-07-18 16:14
[3]:
# Load the daily questionnaire data; the unnamed first column holds the dates
df = (pd.read_csv("data/user_351_smart_panel.csv")
      .rename(columns={"Unnamed: 0": "date"})
      .assign(date=lambda x: pd.to_datetime(x["date"]).dt.date)
      .drop("user_id", axis=1)
      .set_index("date")
     )
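For reference, the cleaning chain above implies the following raw CSV layout (column names inferred from the code; the example row is hypothetical, with values taken from the preview table further down):

,user_id,validated_physical,validated_cognitive,validated_motivational,fatigue,slider
2020-07-15,351,1.496997,,,1.385599,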
[4]:
# Full daily range spanning the first to the last observation
date_range = pd.date_range(df.index.min(), df.index.max())
[5]:
df
[5]:
| date | validated_physical | validated_cognitive | validated_motivational | fatigue | slider |
|---|---|---|---|---|---|
| 2020-07-15 | 1.496997 | NaN | NaN | 1.385599 | NaN |
| 2020-07-18 | NaN | 3.428084 | NaN | 3.334602 | NaN |
| 2020-07-19 | NaN | 0.664916 | NaN | 0.646784 | NaN |
| 2020-07-20 | NaN | NaN | 1.532818 | 1.294205 | NaN |
| 2020-07-21 | 1.496997 | NaN | 0.766409 | 2.032702 | NaN |
| ... | ... | ... | ... | ... | ... |
| 2021-05-27 | NaN | NaN | 2.938475 | 2.481045 | NaN |
| 2021-06-01 | NaN | NaN | NaN | NaN | 63.0 |
| 2021-06-07 | NaN | 1.951801 | NaN | 1.898576 | NaN |
| 2021-06-08 | NaN | NaN | 2.938475 | 2.481045 | 57.0 |
| 2021-06-13 | NaN | 1.951801 | NaN | 1.898576 | NaN |
204 rows × 5 columns
[6]:
df = df.reindex(date_range)
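Reindexing against the full daily range inserts a NaN row for every day without a response, which is what the imputation step below has to fill. A minimal sketch of the idea on toy data (toy values, not the study data):

toy = pd.Series([1.0, 2.0], index=pd.to_datetime(["2020-07-15", "2020-07-18"]))
toy.reindex(pd.date_range(toy.index.min(), toy.index.max()))
# 2020-07-15    1.0
# 2020-07-16    NaN
# 2020-07-17    NaN
# 2020-07-18    2.0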
[7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 334 entries, 2020-07-15 to 2021-06-13
Freq: D
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 validated_physical 86 non-null float64
1 validated_cognitive 113 non-null float64
2 validated_motivational 77 non-null float64
3 fatigue 190 non-null float64
4 slider 62 non-null float64
dtypes: float64(5)
memory usage: 15.7 KB
[8]:
# Heatmap of the raw (unimputed) measures; blank cells are the missing days
_ = plt.figure(figsize=(30, 7))
_ = sns.heatmap(data=df.drop("slider", axis=1).T)
[9]:
current_feature = "fatigue"
Impute the missing data¶
Looking at linear interpolation first¶
[11]:
# Order-1 polynomial interpolation, i.e. straight line segments between observations
ts = df[current_feature].interpolate(method="polynomial", order=1)
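Because the series now sits on a regular daily grid, an order-1 polynomial fit between neighbouring observations should coincide with plain linear interpolation on the interior of the series (edge handling of leading and trailing gaps can differ between the two methods). A quick sanity check, as a sketch:

interior = slice(df[current_feature].first_valid_index(),
                 df[current_feature].last_valid_index())
ts_linear = df[current_feature].interpolate(method="linear")
print(np.allclose(ts.loc[interior], ts_linear.loc[interior]))  # expected: True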
[12]:
_ = plt.figure(figsize=(20, 5))
_ = plt.plot(ts, label=current_feature)
_ = plt.scatter(ts.index, ts.values, c="red")
_ = plt.scatter(df[current_feature].index, df[current_feature].values, c="green")
_ = plt.xticks(rotation=90)
_ = plt.legend()
Ultimately deciding on the MICE algorithm¶
[13]:
# Create the kernel dataset
kds = mf.KernelDataSet(
    df,
    save_all_iterations=True,
    random_state=1991
)
# Run the MICE algorithm for 10 iterations
kds.mice(iterations=10)
# Return the completed (imputed) data
completed_data = kds.complete_data()
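Since save_all_iterations=True, the kernel keeps every iteration, so the imputations can be inspected before they are trusted downstream. A sketch using the plotting helpers available on KernelDataSet in miceforest 2.x (the API changed in later releases, so these names are version-dependent):

kds.plot_mean_convergence()        # mean of the imputed values per iteration
kds.plot_imputed_distributions()   # imputed vs. observed distributions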
[14]:
ts = completed_data[current_feature]
[15]:
_ = plt.figure(figsize=(20, 5))
_ = plt.plot(ts, label=current_feature)
_ = plt.scatter(ts.index, ts.values, c="red")
_ = plt.scatter(df[current_feature].index, df[current_feature].values, c="green")
_ = plt.xticks(rotation=90)
_ = plt.legend()
[16]:
ts_levels_df, _, _ = ts_levels(ts=ts.values,
ts_x=ts.index,
criterion='mse',
max_depth=3,
min_samples_leaf=1,
min_samples_split=2,
max_leaf_nodes=30,
plot=False,
equal_spaced=True,
n_x_ticks=10,
figsize=(20, 5))
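The parameter names mirror sklearn's DecisionTreeRegressor, and that is essentially what ts_levels appears to wrap: a regression tree fitted on the time step, whose piecewise-constant predictions serve as the level estimates. A minimal sketch of that idea, assuming this reading of the function:

from sklearn.tree import DecisionTreeRegressor

X = np.arange(len(ts)).reshape(-1, 1)           # time step as the only feature
tree = DecisionTreeRegressor(criterion="mse",   # same settings as the call above
                             max_depth=3,
                             min_samples_leaf=1,
                             min_samples_split=2,
                             max_leaf_nodes=30)
levels = tree.fit(X, ts.values).predict(X)      # piecewise-constant level estimates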
[17]:
plot_df = ts_levels_df.drop("t_steps", axis=1).melt("ts_x")
[18]:
_ = plt.figure(figsize=(20, 5))
_ = sns.lineplot(data=plot_df, x="ts_x", y="value", hue="variable")
[19]:
def change_profile(x, window_size):
    # Cut the series into non-overlapping windows of `window_size`; within
    # each window, subtract the window sum from every value and divide by
    # the window size. A trailing remainder that does not fill a complete
    # window is dropped. The +1 in the stop bound ensures a final window
    # that fits exactly is still included.
    window_range = np.arange(0, len(x) - window_size + 1, window_size)
    cp_df = pd.DataFrame()
    for window_begin in window_range:
        window = x.iloc[window_begin: window_begin + window_size]
        current_cp_df = (window - window.sum()) / window_size
        cp_df = pd.concat([cp_df, current_cp_df])
    return cp_df[0]
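A small worked example of what change_profile returns (toy values): with window_size=3 the series is cut into non-overlapping windows of three; within each, the window sum is subtracted from every value and the result is divided by the window size:

toy = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
change_profile(toy, window_size=3)
# 0   -1.666667    # (1 - 6) / 3, window sum 1+2+3 = 6
# 1   -1.333333
# 2   -1.000000
# 3   -3.666667    # (4 - 15) / 3, window sum 4+5+6 = 15
# 4   -3.333333
# 5   -3.000000
# (7.0 is dropped: it does not fill a complete window)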
[20]:
ts_cp = change_profile(ts, window_size=6)
[21]:
ts_levels_df, _, _ = ts_levels(ts=ts_cp.values,
ts_x=ts_cp.index,
criterion='mse',
max_depth=3,
min_samples_leaf=1,
min_samples_split=2,
max_leaf_nodes=30,
plot=False,
equal_spaced=True,
n_x_ticks=10,
figsize=(20, 5))
[22]:
plot_df = ts_levels_df.drop("t_steps", axis=1).melt("ts_x")
[23]:
_ = plt.figure(figsize=(20, 5))
_ = sns.lineplot(data=plot_df, x="ts_x", y="value", hue="variable")
_ = plt.ylabel("Fatigue Change Profile")
_ = plt.xlabel("Date")
[24]:
plot_df = (ts_levels_df
.drop(["t_steps"], axis=1)
.rename(columns={"original_ts": "Fatigue_Change_Profile",
"ts_levels": "Fatigue_levels",
"ts_x": "Date"})
.assign(Date=lambda x: x["Date"].astype(str))
.set_index("Date")
)
[25]:
_ = plt.figure(figsize=(20, 3))
_ = sns.heatmap(plot_df.T)
[26]:
fi_df = (pd.DataFrame(fluctuation_intensity(pd.DataFrame(ts_cp), win=7, xmin=0, xmax=1,
                                            col_first=1, col_last=1))
         .rename(columns={0: current_feature}))
[27]:
di_df = (pd.DataFrame(distribution_uniformity(pd.DataFrame(ts_cp), win=7, xmin=0, xmax=1,
                                              col_first=1, col_last=1))
         .rename(columns={0: current_feature}))
[28]:
cr_df = complexity_resonance(fi_df, di_df).rename(columns={0: current_feature})
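In the NLTSA literature, dynamic complexity is defined as the product of fluctuation intensity and distribution uniformity, and complexity_resonance appears to compute exactly that element-wise product. A sketch, under that assumption:

cr_manual = fi_df * di_df
print(np.allclose(cr_manual, cr_df))  # expected: True if that reading holds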
[29]:
_ = complexity_resonance_diagram(fi_df, cmap_n=12, plot_title='Fluctuation Intensity Diagram', labels_n=7, figsize=(20, 3))
[30]:
_ = complexity_resonance_diagram(di_df, cmap_n=12, plot_title='Distribution Uniformity Diagram', labels_n=7, figsize=(20, 3))
[31]:
_ = complexity_resonance_diagram(cr_df, cmap_n=12, plot_title='Complexity Resonance Diagram', labels_n=7, figsize=(20, 3))
[32]:
cumulative_complexity_peaks_df, significant_peaks_df = cumulative_complexity_peaks(df=cr_df,
significant_level_item = 0.05,
significant_level_time = 0.05)
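cumulative_complexity_peaks reduces the complexity resonance matrix to the dates on which unusually many items show unusually high complexity at once. One plausible reading of the thresholding, sketched here (not the jmspack implementation itself; the quantile form is an assumption):

item_cutoffs = cr_df.quantile(1 - 0.05)        # per-item cutoff at the 95th percentile (assumed form)
peaks = (cr_df > item_cutoffs).astype(int)     # item-level peak indicator per day
ccp = peaks.sum(axis=1)                        # how many items peak on each day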
[33]:
_ = cumulative_complexity_peaks_plot(cumulative_complexity_peaks_df=cumulative_complexity_peaks_df,
significant_peaks_df=significant_peaks_df,
figsize = (20, 3),
height_ratios = [1, 3],
labels_n = 7)
[34]:
significant_peaks_df[significant_peaks_df["Significant CCPs"]==1]
[34]:
| date | Significant CCPs |
|---|---|
| 2020-07-21 | 1.0 |
| 2020-07-22 | 1.0 |
| 2020-07-23 | 1.0 |
| 2020-07-24 | 1.0 |
| 2020-08-14 | 1.0 |
| 2020-08-15 | 1.0 |
| 2020-09-23 | 1.0 |
| 2020-10-03 | 1.0 |
| 2020-10-15 | 1.0 |
| 2020-10-16 | 1.0 |
| 2020-12-04 | 1.0 |
| 2020-12-05 | 1.0 |
| 2020-12-06 | 1.0 |
| 2020-12-13 | 1.0 |
| 2020-12-14 | 1.0 |
| 2020-12-15 | 1.0 |
| 2020-12-16 | 1.0 |
| 2020-12-17 | 1.0 |
| 2021-01-29 | 1.0 |
| 2021-01-30 | 1.0 |
| 2021-01-31 | 1.0 |
| 2021-04-20 | 1.0 |
| 2021-04-21 | 1.0 |
| 2021-04-22 | 1.0 |
| 2021-05-30 | 1.0 |
| 2021-05-31 | 1.0 |
Running the CRDs on all the columns out of curiosity (not very reliable given the amount of missingness)¶
[35]:
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 334 entries, 2020-07-15 to 2021-06-13
Freq: D
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 validated_physical 86 non-null float64
1 validated_cognitive 113 non-null float64
2 validated_motivational 77 non-null float64
3 fatigue 190 non-null float64
4 slider 62 non-null float64
dtypes: float64(5)
memory usage: 23.8 KB
[36]:
completed_data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 334 entries, 2020-07-15 to 2021-06-13
Freq: D
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 validated_physical 334 non-null float64
1 validated_cognitive 334 non-null float64
2 validated_motivational 334 non-null float64
3 fatigue 334 non-null float64
4 slider 334 non-null float64
dtypes: float64(5)
memory usage: 23.8 KB
[37]:
# completed_data.describe()
[38]:
scal_df = pd.DataFrame(MinMaxScaler().fit_transform(completed_data),
columns = completed_data.columns,
index = completed_data.index)
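MinMaxScaler maps each column onto [0, 1] via (x - min) / (max - min), which is what justifies passing xmin=0, xmax=1 to the NLTSA functions below. The same transform by hand, as a check:

scal_manual = (completed_data - completed_data.min()) / (
    completed_data.max() - completed_data.min())
print(np.allclose(scal_df, scal_manual))  # expected: True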
[39]:
fi_df = pd.DataFrame(fluctuation_intensity(scal_df, win=7, xmin=0, xmax=1,
                                           col_first=1, col_last=completed_data.shape[1]))
[40]:
di_df = pd.DataFrame(distribution_uniformity(scal_df, win=7, xmin=0, xmax=1,
                                             col_first=1, col_last=completed_data.shape[1]))
[41]:
cr_df = complexity_resonance(fi_df, di_df)
[42]:
_ = complexity_resonance_diagram(fi_df, cmap_n=12, plot_title='Fluctuation Intensity Diagram', labels_n=7, figsize=(20, 3))
[43]:
_ = complexity_resonance_diagram(di_df, cmap_n=12, plot_title='Distribution Uniformity Diagram', labels_n=7, figsize=(20, 3))
[44]:
_ = complexity_resonance_diagram(cr_df, cmap_n=12, plot_title='Complexity Resonance Diagram', labels_n=7, figsize=(20, 3))
[45]:
cumulative_complexity_peaks_df, significant_peaks_df = cumulative_complexity_peaks(df=cr_df,
significant_level_item = 0.05,
significant_level_time = 0.05)
[46]:
_ = cumulative_complexity_peaks_plot(cumulative_complexity_peaks_df=cumulative_complexity_peaks_df,
significant_peaks_df=significant_peaks_df,
figsize = (20, 3),
height_ratios = [1, 3],
labels_n = 7)