An example of a use of groupby apply to apply a model to a set of individual groupsΒΆ

[10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
[2]:
X, y, coef = make_regression(n_samples=100,
                             n_features=10,
                             n_informative=5,
                             n_targets=1,
                             bias=0.0,
                             effective_rank=None,
                             tail_strength=0.5,
                             noise=100,
                             shuffle=True,
                             coef=True,
                             random_state=42)
[3]:
pd.DataFrame(coef, index=[f"feat_{x}" for x in range(0, coef.shape[0])]).T
[3]:
feat_0 feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9
0 16.748258 0.0 0.0 63.643025 0.0 70.647573 0.0 10.456784 3.158614 0.0
[4]:
df = (pd.DataFrame(X, columns=[f"feat_{x}" for x in range(0, X.shape[1])])
.merge(pd.DataFrame(y, columns=["target"]),
      left_index=True,
      right_index=True))
[6]:
df.shape
[6]:
(100, 11)
[8]:
df["group"] = np.repeat(range(0, 10), 10)
[12]:
df.head()
[12]:
feat_0 feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 target group
0 -0.926930 -1.430141 1.632411 -3.241267 -1.247783 -1.024388 0.130741 -0.059525 -0.252568 -0.440044 -186.494628 0
1 0.202923 0.334457 0.285865 1.547505 -0.387702 1.795878 2.010205 -1.515744 -0.612789 0.658544 191.976107 0
2 -0.241236 0.456753 0.342725 -1.251539 1.117296 1.443765 0.447709 0.352055 -0.082151 0.569767 315.503594 0
3 0.289775 -1.008086 -2.038125 0.871125 -0.408075 -0.326024 -0.351513 2.075401 1.201214 -1.870792 100.185659 0
4 -0.007973 -0.190339 -1.037246 0.077368 0.538910 -0.861284 -1.382800 1.479944 1.523124 -0.875618 -40.813080 0
[32]:
def my_super_cool_function(df, feat_list, target):
    df = pd.DataFrame(MinMaxScaler().fit_transform(df), index=df.index, columns = df.columns)

    X = df[feat_list]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

    model = LinearRegression()
    _ = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return X_test.assign(y_pred = y_pred, target=y_test)
[28]:
features_list = (df
                 .filter(regex="feat")
                 .columns
                 .tolist()
                )
[29]:
my_super_cool_function(df[df["group"]==0], feat_list = features_list, target = "target")
[29]:
feat_0 feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 y_pred target group
1 0.544988 0.935186 0.633147 1.000000 0.363659 1.000000 1.000000 0.127226 0.130375 0.829266 0.659026 0.753928 0.0
3 0.576900 0.223677 0.000000 0.858757 0.355044 0.302377 0.303945 1.000000 0.771786 0.000000 0.391783 0.571078 0.0
[50]:
df_results = (df
 .groupby("group")
 .apply(my_super_cool_function, features_list, "target")
)
[69]:
# (df
#  .groupby("group")
# .describe())
[55]:
def magnify():
    return [dict(selector="th",
                 props=[("font-size", "8pt")]),
            dict(selector="td",
                 props=[('padding', "0em 0em")]),
            dict(selector="th:hover",
                 props=[("font-size", "12pt")]),
            dict(selector="tr:hover td:hover",
                 props=[('max-width', '200px'),
                        ('font-size', '12pt')])
]
[68]:
np.random.seed(25)
cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)

(df_results[["y_pred", "target"]]
 .assign(difference = lambda x: x.diff(axis=1)["target"])
 .abs()
 .style.background_gradient(cmap, axis=1)
    .set_caption("Hover to magnify")
    .set_table_styles(magnify()))
[68]:
Hover to magnify
y_pred target difference
group
0 5 0.167801 0.208114 0.040312
1 0.764280 0.753928 0.010352
1 19 0.666128 0.000000 0.666128
12 0.700345 0.076113 0.624233
2 22 0.696564 0.000000 0.696564
23 1.656731 0.247903 1.408828
3 36 0.062992 0.117258 0.180250
37 0.318920 0.393723 0.074803
4 43 0.559197 0.406571 0.152626
40 0.883836 0.019515 0.864321
5 53 0.459780 0.296724 0.163055
59 0.543750 0.379198 0.164552
6 60 0.857828 0.453361 0.404467
68 0.903212 0.278278 0.624934
7 71 0.127945 0.000000 0.127945
75 0.113815 0.567656 0.681471
8 81 0.507365 0.000000 0.507365
84 0.147551 0.401466 0.549016
9 97 0.907485 1.000000 0.092515
91 0.361340 0.717797 0.356457
[42]:
# _ = plt.figure(figsize=(30, 10))
# _ = sns.heatmap(df
#                 .drop("target", axis=1)
#  .groupby("group")
# .describe())