Sleep EMA CatBoost with Cross Validation
Replication Data for Wu et al. (2020) “Multi-Modal Data Collection for Measuring Health, Behavior, and Living Environment of Large-Scale Participant Cohorts: Conceptual Framework and Findings from Deployments”: Ecological Momentary Assessment Data (Beiwe)
[1]:
import pandas as pd
import numpy as np
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, cross_validate
[2]:
from catboost import CatBoostRegressor, Pool
import shap
# Initialise SHAP's JavaScript support so its interactive plots can render in the notebook.
shap.initjs()
[3]:
# Apply the custom Matplotlib style sheet only when it is installed;
# otherwise keep whatever style is currently active.
if "jms_style_sheet" in plt.style.available:
    plt.style.use("jms_style_sheet")
[4]:
# Read the wide-format EMA table, keep only rows without missing values,
# and renumber the index so it is contiguous again after the drop.
df = pd.read_csv("data/UT1000_ema_wide.csv")
df = df.dropna()
df = df.reset_index(drop=True)

# Preview the first rows, then report (n_rows, n_columns) as the cell output.
display(df.head())
df.shape
| | pid | survey.date | content | energy | lonely | refreshed | restful | sad | sleep | stress |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1193rv5x | 2019-02-14 | 3.0 | 3.0 | 2.0 | 2.0 | 3.0 | 1.0 | 5.0 | 1.0 |
1 | 1193rv5x | 2019-02-15 | 2.0 | 3.0 | 3.0 | 1.0 | 2.0 | 2.0 | 6.0 | 1.0 |
2 | 1193rv5x | 2019-02-16 | 1.0 | 2.0 | 3.0 | 1.0 | 2.0 | 2.0 | 9.0 | 1.0 |
3 | 1193rv5x | 2019-02-17 | 1.0 | 1.0 | 3.0 | 2.0 | 2.0 | 3.0 | 9.0 | 2.0 |
4 | 1193rv5x | 2019-02-18 | 3.0 | 3.0 | 1.0 | 1.0 | 2.0 | 1.0 | 5.0 | 1.0 |
[4]:
(17705, 10)
[5]:
# Column to predict; every other float-typed column becomes a predictor.
target = "sleep"
float_columns = df.select_dtypes(float)
feature_list = float_columns.drop(columns=target).columns.tolist()

# Horizontal count plot showing how often each target value occurs.
_ = plt.figure(figsize=(5, 3))
_ = sns.countplot(y=df[target])
[6]:
# Violin plot of the target with the individual observations drawn on top.
target_values = df[target]
_ = sns.violinplot(y=target_values)
_ = sns.stripplot(y=target_values, edgecolor="white", linewidth=1)
[7]:
def _as_categorical(frame):
    """Cast *frame* to int and then to the pandas ``category`` dtype."""
    return frame.astype(int).astype("category")


# Random 80/20 hold-out split on row labels (seeded for reproducibility).
# NOTE(review): rows are sampled individually, so the same participant (pid)
# can end up in both the train and the test set — confirm this is intended.
train_idx = df.sample(frac=0.8, random_state=42).index.tolist()
test_idx = df.drop(index=train_idx).index.tolist()

# Full predictor matrix and target, used by the cross-validation loop.
X = _as_categorical(df[feature_list])
y = df[target]

# Hold-out predictors (categorical) and integer-cast targets.
X_train = _as_categorical(df.loc[train_idx, feature_list])
X_test = _as_categorical(df.loc[test_idx, feature_list])
y_train = df.loc[train_idx, target].astype(int)
y_test = df.loc[test_idx, target].astype(int)
[8]:
# Hyper-parameters for the hold-out CatBoost regressor.
# NOTE(review): learning_rate=1 is a very large step size for gradient
# boosting — confirm it is intentional.
catboost_params = {
    "iterations": 500,
    "depth": None,
    "learning_rate": 1,
    "loss_function": "RMSE",
    "verbose": False,
}
model = CatBoostRegressor(**catboost_params)

# Fit on the training split, declaring every predictor as categorical.
_ = model.fit(X_train, y_train, cat_features=feature_list)
[9]:
# Tabulate the fitted model's per-feature importances (rounded to 2 decimals),
# ordered from most to least important.
feature_importance = pd.Series(
    model.feature_importances_.round(2),
    index=feature_list,
)
feature_importance_df = (
    feature_importance
    .sort_values(ascending=False)
    .rename_axis("feature")
    .reset_index(name="feature_importance")
)

# Horizontal bar chart, one bar per feature.
_ = plt.figure(figsize=(7, 3))
gini_plot = sns.barplot(
    data=feature_importance_df,
    x="feature_importance",
    y="feature",
)
_ = plt.title("Feature Importance")
[10]:
# SHAP values for every hold-out row, computed by CatBoost itself.
test_pool = Pool(X_test, label=y_test, cat_features=feature_list)
shap_values = model.get_feature_importance(test_pool, type="ShapValues")

# Drop the trailing column so one column per feature remains
# (CatBoost appends the expected value as the last column).
shap_values = shap_values[:, :-1]

_ = shap.summary_plot(
    shap_values,
    X_test.astype(int),
    feature_names=X_test.columns,
    max_display=X_test.shape[1],
    show=True,
)
[11]:
# Predict the hold-out set; only the first 1000 test rows are scored and plotted.
y_pred = model.predict(X_test)
df_test = pd.DataFrame({"y_pred": y_pred, target: y_test}).head(1000)

# Signed residuals and summary errors, computed once and reused below.
errors = df_test["y_pred"] - df_test[target]
rmse = np.sqrt(mean_squared_error(df_test[target], df_test["y_pred"]))
bias = np.mean(errors)

# First and last index labels bound the horizontal +/- RMSE guide lines.
user_ids_first = df_test.index[0]
user_ids_last = df_test.index[-1]

_ = plt.figure(figsize=(30, 8))
_ = plt.title(f"Catboost Regressor(predicted set) | RMSE = {round(rmse, 4)} | bias Error = {round(bias, 4)}")

# `use_line_collection` is no longer passed: Matplotlib removed the parameter
# (line collections have been the default since 3.3), so passing it raises
# TypeError on current releases.
rmse_plot = plt.stem(df_test.index, errors, linefmt="grey", markerfmt="D")

# Dash-dotted guide lines at +RMSE and -RMSE.
for sign, label in ((1, "+ RMSE"), (-1, "- RMSE")):
    _ = plt.hlines(
        y=round(sign * rmse, 2),
        colors="b",
        linestyles="-.",
        label=label,
        xmin=user_ids_first,
        xmax=user_ids_last,
    )

_ = plt.xticks(rotation=90, ticks=df_test.index)
_ = plt.ylabel(f"'Error = y_predicted - {target}'")
_ = plt.legend()
[12]:
# Repeated 10x10 k-fold cross-validation of the CatBoost regressor.
# `accuracies_list` collects one RMSE per fold (100 values in total);
# `all_pred_test_df` holds every out-of-fold prediction tagged with its fold.
accuracies_list = []
fold_predictions = []
all_cors_df = pd.DataFrame()  # initialised here but not filled in this cell

kfold = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)

model = CatBoostRegressor(iterations=500,
                          depth=None,
                          learning_rate=1,
                          loss_function='RMSE',
                          verbose=False)

# Loop-invariant: every predictor column is treated as categorical.
cat_feature_names = X.columns.tolist()

for fold_number, (train_ix, test_ix) in enumerate(kfold.split(X), start=1):
    # The splitter yields positional indices, so select rows by position.
    train_X, test_X = X.iloc[train_ix, :], X.iloc[test_ix, :]
    train_y, test_y = y.iloc[train_ix], y.iloc[test_ix]

    # Refit the same estimator object on this fold's training rows.
    _ = model.fit(X=train_X,
                  y=train_y,
                  cat_features=cat_feature_names)

    pred_y = model.predict(test_X)
    accuracies_list.append(np.sqrt(mean_squared_error(test_y, pred_y)))

    fold_predictions.append(pd.DataFrame({target: test_y,
                                          "predict": pred_y,
                                          "fold_number": f"fold_{fold_number}"}))

# Concatenate once at the end instead of re-copying a growing frame per fold.
all_pred_test_df = pd.concat(fold_predictions)
[13]:
# Box plot of the per-fold RMSE values with each fold drawn as a point.
_ = plt.figure(figsize=(3, 5))
cv_axes = plt.gca()
_ = sns.boxplot(y=accuracies_list, ax=cv_axes)
_ = sns.swarmplot(y=accuracies_list, edgecolor="white", linewidth=1, ax=cv_axes)
_ = cv_axes.set_title("RMSE Cat Boost\nRegressor kfold cross validation")
[14]:
# Summary statistics of the per-fold RMSE values, laid out as a single row.
rmse_summary = pd.DataFrame(accuracies_list).describe()
rmse_summary.transpose()
[14]:
| | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
0 | 100.0 | 1.022321 | 0.043976 | 0.891743 | 0.987997 | 1.020195 | 1.052509 | 1.133438 |