PIMA diabetes Classification¶
[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import session_info
from jmspack.utils import (apply_scaling,
JmsColors
)
from ml_class import (plot_cv_indices,
plot_decision_boundary,
plot_learning_curve,
multi_roc_auc_plot,
dict_of_models,
RFE_opt_rf,
make_confusion_matrix)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
GradientBoostingClassifier)
from sklearn.model_selection import (GridSearchCV,
RepeatedStratifiedKFold,
cross_val_score,
cross_validate,
train_test_split,
KFold)
from sklearn.feature_selection import RFE
from sklearn.metrics import (confusion_matrix,
roc_curve,
auc)
from imblearn.over_sampling import (SMOTE,
SVMSMOTE,
KMeansSMOTE)
import shap
[2]:
# init the JS visualization code
shap.initjs()
[3]:
session_info.show(req_file_name="pima-requirements.txt",
write_req_file=False) #add write_req_file=True to function to get requirements.txt file of packages used
[3]:
Click to view session information
----- imblearn 0.8.1 jmspack 0.1.0 matplotlib 3.4.3 ml_class NA numpy 1.21.4 pandas 1.3.4 seaborn 0.11.2 session_info 1.0.0 shap 0.40.0 sklearn 1.0.1 -----
Click to view modules imported as dependencies
PIL 8.4.0 anyio NA appnope 0.1.2 attr 21.2.0 babel 2.9.1 backcall 0.2.0 beta_ufunc NA binom_ufunc NA brotli NA certifi 2021.10.08 cffi 1.15.0 chardet 4.0.0 charset_normalizer 2.0.0 cloudpickle 2.0.0 colorama 0.4.4 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 debugpy 1.5.1 decorator 5.1.0 defusedxml 0.7.1 entrypoints 0.3 idna 3.1 importlib_resources NA ipykernel 6.5.0 ipython_genutils 0.2.0 jedi 0.18.0 jinja2 3.0.3 joblib 1.1.0 json5 NA jsonschema 4.2.1 jupyter_server 1.11.2 jupyterlab_server 2.8.2 kiwisolver 1.3.2 llvmlite 0.36.0 markupsafe 2.0.1 matplotlib_inline NA mpl_toolkits NA nbclassic NA nbformat 5.1.3 nbinom_ufunc NA numba 0.53.1 packaging 21.0 parso 0.8.2 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prometheus_client NA prompt_toolkit 3.0.22 ptyprocess 0.7.0 pvectorc NA pydev_ipython NA pydevconsole NA pydevd 2.6.0 pydevd_concurrency_analyser NA pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.10.0 pyparsing 3.0.6 pyrsistent NA pytz 2021.3 requests 2.26.0 scipy 1.7.2 send2trash NA six 1.16.0 slicer NA sniffio 1.2.0 socks 1.7.1 statsmodels 0.13.1 storemagic NA terminado 0.12.1 threadpoolctl 3.0.0 tornado 6.1 tqdm 4.62.3 traitlets 5.1.1 urllib3 1.26.7 wcwidth 0.2.5 websocket 0.57.0 zipp NA zmq 22.3.0
----- IPython 7.29.0 jupyter_client 7.0.6 jupyter_core 4.9.1 jupyterlab 3.2.3 notebook 6.4.5 ----- Python 3.8.12 | packaged by conda-forge | (default, Oct 12 2021, 21:50:38) [Clang 11.1.0 ] macOS-10.15.7-x86_64-i386-64bit ----- Session information updated at 2021-11-14 19:59
[4]:
if "jms_style_sheet" in plt.style.available:
plt.style.use("jms_style_sheet")
[5]:
df = pd.read_csv("diabetes.csv")
[6]:
display(df.head()), display(df.shape)
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
(768, 9)
[6]:
(None, None)
[7]:
target = "Outcome"
Mask 0 default values (seen in EDA)¶
[8]:
mask_default_values=True
[9]:
if mask_default_values:
df = (df.drop([target, "Pregnancies"], axis=1)
.replace(0, np.nan)
.merge(df[[target, "Pregnancies"]], left_index=True, right_index=True)
.dropna()
.reset_index(drop=True)
)
Plot the amount of rows in each side of the target¶
Looks like the target is imbalanced so this needs to be taken into account¶
[10]:
_ = sns.countplot(x=df[target])
print("Amount in each outcome")
df[target].value_counts()
Amount in each outcome
[10]:
0 262
1 130
Name: Outcome, dtype: int64
Resample data to correct for imbalance¶
[11]:
resample_data=True
[12]:
if resample_data:
X = df.drop(target, axis=1)
y = df[target]
sm = KMeansSMOTE(sampling_strategy="not majority",
random_state=42,
n_jobs=2)
X_res, y_res = sm.fit_resample(X, y)
print(X.shape, y.shape, X_res.shape, y_res.shape)
df = pd.concat([X_res, y_res], axis=1)
(392, 8) (392,) (525, 8) (525,)
[13]:
X = df.drop(target, axis=1)
y = df[target]
[14]:
dict_of_models
[14]:
[{'label': 'Logistic Regression', 'model': LogisticRegression()},
{'label': 'Gradient Boosting', 'model': GradientBoostingClassifier()},
{'label': 'K_Neighbors Classifier',
'model': KNeighborsClassifier(n_neighbors=3)},
{'label': 'SVM Classifier (linear)',
'model': SVC(C=0.025, kernel='linear', probability=True)},
{'label': 'SVM Classifier (Radial Basis Function; RBF)',
'model': SVC(C=1, gamma=2, probability=True)},
{'label': 'Gaussian Process Classifier',
'model': GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1))},
{'label': 'Decision Tree (depth=5)',
'model': DecisionTreeClassifier(max_depth=5)},
{'label': 'Random Forest Classifier(depth=5)',
'model': RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)},
{'label': 'Multilayer Perceptron (MLP) Classifier',
'model': MLPClassifier(alpha=1, max_iter=1000)},
{'label': 'AdaBoost Classifier', 'model': AdaBoostClassifier()},
{'label': 'Naive Bayes (Gaussian) Classifier', 'model': GaussianNB()},
{'label': 'Quadratic Discriminant Analysis Classifier',
'model': QuadraticDiscriminantAnalysis()}]
[15]:
_ = multi_roc_auc_plot(X=X,
y=y,
models=dict_of_models)
[16]:
n_features_to_select = 2 #[1, 2, 5, 10, 50, 100]
max_depth = [1, 2, 5, 10, 50, 100]
n_estimators = [1, 2, 5, 10, 50, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300]
[17]:
# Instanciate Random Forest
# rf = RandomForestClassifier(random_state = 42,
# oob_score = False) # use oob_score with many trees
rf = GradientBoostingClassifier(random_state = 42)
# Define params_dt
params_rf = {'max_depth' : max_depth,
# 'n_estimators' : n_estimators,
'max_features' : ['log2', 'auto', 'sqrt'],
# 'criterion' : ['gini', 'entropy'], #for RandomForestClassifier
'criterion' : ['friedman_mse', 'squared_error'] #for GradientBoostingClassifier
}
# Instantiate grid_dt
grid_dt = GridSearchCV(estimator = rf,
param_grid = params_rf,
scoring = 'roc_auc',
cv = 3,
n_jobs = -2)
# Optimize hyperparameter
_ = grid_dt.fit(X, y)
# Extract the best estimator
optimized_rf = grid_dt.best_estimator_
# Create the RFE with a optimized random forest
# rfe = RFE(estimator = optimized_rf,
# n_features_to_select = n_features_to_select,
# verbose = 1)
rfe = RFE(estimator = optimized_rf,
n_features_to_select = n_features_to_select,
verbose = 1)
# Fit the eliminator to the data
_ = rfe.fit(X, y)
# create dataframe with features ranking (high = dropped early on)
feature_ranking = pd.DataFrame(data = dict(zip(X.columns, rfe.ranking_)) ,
index = np.arange(0, len(X.columns)))
feature_ranking = feature_ranking.loc[0,:].sort_values()
# create dataframe with feature selected
feature_selected = X.columns[rfe.support_].to_list()
# create dataframe with importances per feature
feature_importance = pd.Series(dict(zip(X.columns, optimized_rf.feature_importances_.round(2))))
max_depth = optimized_rf.get_params()['max_depth']
max_features = optimized_rf.get_params()['max_features']
criterion = optimized_rf.get_params()['criterion']
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
[18]:
feature_importance_df = pd.DataFrame(feature_importance.sort_values(ascending=False)).reset_index().rename(columns={"index": "feature", 0: "feature_importance"})
[19]:
criterion, max_features, max_depth
[19]:
('friedman_mse', 'sqrt', 5)
[20]:
feature_importance_df
[20]:
feature | feature_importance | |
---|---|---|
0 | Glucose | 0.39 |
1 | Age | 0.17 |
2 | Insulin | 0.14 |
3 | BMI | 0.10 |
4 | DiabetesPedigreeFunction | 0.09 |
5 | SkinThickness | 0.04 |
6 | Pregnancies | 0.04 |
7 | BloodPressure | 0.03 |
[21]:
_ = plt.figure(figsize=(7, 4))
_ = sns.barplot(data=feature_importance_df, x="feature_importance", y="feature")
# _ = plt.savefig("images/feature_importances_diabetes_all.png", dpi=400, bbox_inches="tight")
[22]:
explainer = shap.Explainer(optimized_rf)
shap_values = explainer(X)
[23]:
shap.plots.beeswarm(shap_values)
[24]:
clf = optimized_rf
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate the splits and summarize the distributions
for train_ix, test_ix in kfold.split(X):
# select rows
train_X, test_X = X.loc[train_ix, :], X.loc[test_ix, :]
train_y, test_y = y.loc[train_ix], y.loc[test_ix]
# summarize train and test composition
train_0, train_1 = len(train_y[train_y==0]), len(train_y[train_y==1])
test_0, test_1 = len(test_y[test_y==0]), len(test_y[test_y==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))
_ = clf.fit(X = train_X,
y = train_y)
pred_y = clf.predict(test_X)
cf_matrix = confusion_matrix(test_y, pred_y)
_ = make_confusion_matrix(cf=cf_matrix,
title="Confusion Matrix classifying diabetes",
sum_stats=True,
cmap="magma")
_ = plt.show()
>Train: 0=201, 1=219, Test: 0=61, 1=44
>Train: 0=210, 1=210, Test: 0=52, 1=53
>Train: 0=215, 1=205, Test: 0=47, 1=58
>Train: 0=206, 1=214, Test: 0=56, 1=49
>Train: 0=216, 1=204, Test: 0=46, 1=59
[25]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
[26]:
# import sklearn
# sorted(sklearn.metrics.SCORERS.keys())
[27]:
scoring_list = ('accuracy',
'balanced_accuracy',
'f1',
'f1_weighted',
'precision',
'precision_weighted',
'recall',
'recall_weighted',
'roc_auc',
)
[28]:
tmp_out = cross_validate(optimized_rf,
X,
y,
scoring=scoring_list,
return_train_score=False,
cv=cv,
n_jobs=-1,
# fit_params={"sample_weight": sampling_weights} # fit_params is returning nans for some reason :/
)
[29]:
pd.DataFrame(tmp_out).drop(["fit_time", "score_time"], axis=1).agg(["mean", "median", "std"]).round(3)
[29]:
test_accuracy | test_balanced_accuracy | test_f1 | test_f1_weighted | test_precision | test_precision_weighted | test_recall | test_recall_weighted | test_roc_auc | |
---|---|---|---|---|---|---|---|---|---|
mean | 0.838 | 0.838 | 0.836 | 0.838 | 0.854 | 0.843 | 0.824 | 0.838 | 0.922 |
median | 0.846 | 0.846 | 0.846 | 0.846 | 0.857 | 0.851 | 0.830 | 0.846 | 0.926 |
std | 0.053 | 0.053 | 0.054 | 0.054 | 0.070 | 0.053 | 0.076 | 0.053 | 0.038 |
[30]:
cv_metrics_df = pd.DataFrame(tmp_out).drop(["fit_time", "score_time"], axis=1).melt(var_name="Metric")
[31]:
_ = plt.figure(figsize=(25,5))
_ = sns.boxplot(data = cv_metrics_df,
x = "Metric",
y = "value")
_ = sns.swarmplot(data = cv_metrics_df,
x = "Metric",
y = "value", edgecolor="white", linewidth=1)
_ = plt.title(f"All performance metrics {optimized_rf.__class__.__name__} with cross validation")
[32]:
fig = plot_learning_curve(estimator=clf,
title=f'Learning Curve of {optimized_rf.__class__.__name__} of diabetes',
X=X,
y=y,
groups=None,
cross_color=JmsColors.PURPLE,
test_color=JmsColors.YELLOW,
scoring='roc_auc',
ylim=None,
cv=None,
n_jobs=10,
train_sizes=np.linspace(.1, 1.0, 40),
figsize=(10,5))
# _ = plt.savefig(f"images/learning_curve_diabetes.png", dpi=400, bbox_inches="tight")
[33]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
# _ = optimized_rf.fit(X_train, y_train) # train the model
# y_pred = optimized_rf.predict(X_test) # predict the test data
# y_pred_proba = optimized_rf.predict_proba(X_test)[:, 1] # predict the test data
# fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1, drop_intermediate=True)
# auc_score = auc(fpr,tpr)
# _ = plt.figure(figsize=(7,6))
# _ = plt.plot(fpr, tpr)
# _ = plt.scatter(fpr, tpr)
# _ = plt.annotate(xy=(0.9, 0.1), s=f"AUC: {round(auc_score,3)}", ha="center")
# _ = plt.plot([0,1], [0, 1], c="grey", ls="--")
# _ = plt.title(f'ROC curve {str(optimized_rf).split("(")[0]} - diabetes classification')
# _ = sns.despine()
[34]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
auc_score_list = list()
# enumerate the splits and summarize the distributions
_ = plt.figure(figsize=(7,6))
for train_ix, test_ix in kfold.split(X):
# select rows
X_train, X_test = X.loc[train_ix, :], X.loc[test_ix, :]
y_train, y_test = y.loc[train_ix], y.loc[test_ix]
_ = optimized_rf.fit(X_train, y_train) # train the model
y_pred = optimized_rf.predict(X_test) # predict the test data
y_pred_proba = optimized_rf.predict_proba(X_test)[:, 1] # predict the test data
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1, drop_intermediate=True)
auc_score = auc(fpr,tpr)
print(auc_score)
auc_score_list.append(auc_score)
_ = plt.plot(fpr, tpr)
_ = plt.scatter(fpr, tpr)
_ = plt.annotate(xy=(0.9, 0.1), text=f"Mean AUC: {round(np.mean(auc_score_list),3)}", ha="center")
_ = plt.plot([0,1], [0, 1], c="grey", ls="--")
_ = plt.title(f'ROC curve {optimized_rf.__class__.__name__} - diabetes classification')
_ = sns.despine()
0.9306676449009538
0.920479302832244
0.9369038884812912
0.9309090909090908
0.9216255442670538
[35]:
X = df[feature_selected]
y = df[target]
[36]:
_ = plot_decision_boundary(X=X,
y=y,
clf=optimized_rf,
title = f'Decision Boundary - {optimized_rf.__class__.__name__}',
legend_title = "Diabetes Outcome",
h=0.5,
figsize=(10, 8))