An example of using pandas groupby-apply to fit a separate model to each group

[10]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

[2]:

# Synthetic regression data: 100 samples, 10 features (5 informative),
# noise=100. `coef` receives the coefficients used to generate `y`.
X, y, coef = make_regression(
    n_samples=100,
    n_features=10,
    n_informative=5,
    n_targets=1,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=100,
    shuffle=True,
    coef=True,
    random_state=42,
)

[3]:

# One-row table of the generating coefficients, one column per feature.
pd.DataFrame([coef], columns=[f"feat_{i}" for i in range(coef.shape[0])])

[3]:

	feat_0	feat_1	feat_2	feat_3	feat_4	feat_5	feat_6	feat_7	feat_8	feat_9
0	16.748258	0.0	0.0	63.643025	0.0	70.647573	0.0	10.456784	3.158614	0.0

[4]:

# Assemble the modelling frame: one column per feature plus the target.
# `assign` attaches `y` row-for-row, so there is no need to build a second
# DataFrame and merge the two on their indexes.
df = (
    pd.DataFrame(X, columns=[f"feat_{i}" for i in range(X.shape[1])])
    .assign(target=y)
)

[6]:

# Sanity check: expect 100 rows and 11 columns (10 features + target).
df.shape

[6]:

(100, 11)

[8]:

# Label the rows with 10 consecutive groups of 10 rows each (0,0,...,9,9).
df["group"] = np.repeat(np.arange(10), 10)

[12]:

# Peek at the first rows of the assembled frame.
df.head()

[12]:

	feat_0	feat_1	feat_2	feat_3	feat_4	feat_5	feat_6	feat_7	feat_8	feat_9	target
0	-0.926930	-1.430141	1.632411	-3.241267	-1.247783	-1.024388	0.130741	-0.059525	-0.252568	-0.440044	-186.494628
1	0.202923	0.334457	0.285865	1.547505	-0.387702	1.795878	2.010205	-1.515744	-0.612789	0.658544	191.976107
2	-0.241236	0.456753	0.342725	-1.251539	1.117296	1.443765	0.447709	0.352055	-0.082151	0.569767	315.503594
3	0.289775	-1.008086	-2.038125	0.871125	-0.408075	-0.326024	-0.351513	2.075401	1.201214	-1.870792	100.185659
4	-0.007973	-0.190339	-1.037246	0.077368	0.538910	-0.861284	-1.382800	1.479944	1.523124	-0.875618	-40.813080

[32]:

def my_super_cool_function(df, feat_list, target, random_state=None):
    """Fit a linear regression on one group and return its held-out predictions.

    The rows of ``df`` are split 80/20 into train and test parts. A
    ``MinMaxScaler`` is fitted on the training rows only and then applied to
    both parts, so the held-out rows never influence the scaling. A
    ``LinearRegression`` is trained on the scaled training rows and used to
    predict the scaled test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single group. Only the ``feat_list`` and
        ``target`` columns are read; any other column (for example the
        grouping key) is ignored rather than scaled.
    feat_list : list of str
        Names of the feature columns.
    target : str
        Name of the target column.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split``. Pass an int for a repeatable
        split; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Scaled test-set features, indexed like the matching rows of ``df``,
        plus ``y_pred`` (model prediction) and ``target`` (scaled true
        value). Because the scaler only sees training rows, test values may
        fall slightly outside ``[0, 1]``.
    """
    used_cols = [*feat_list, target]

    # Split first: fitting the scaler before splitting would leak the
    # min/max of the test rows into the training data.
    train_raw, test_raw = train_test_split(
        df[used_cols], train_size=0.8, random_state=random_state
    )

    scaler = MinMaxScaler().fit(train_raw)
    train = pd.DataFrame(
        scaler.transform(train_raw), index=train_raw.index, columns=used_cols
    )
    test = pd.DataFrame(
        scaler.transform(test_raw), index=test_raw.index, columns=used_cols
    )

    model = LinearRegression().fit(train[feat_list], train[target])
    y_pred = model.predict(test[feat_list])

    # The output column is always called "target", whatever the input
    # column was named, matching the original return shape.
    return test[feat_list].assign(y_pred=y_pred, target=test[target])

[28]:

# Every column whose name contains "feat" is a model input.
features_list = list(df.filter(like="feat").columns)

[29]:

# Try the helper on a single group before fanning out with groupby.
first_group = df.loc[df["group"].eq(0)]
my_super_cool_function(first_group, feat_list=features_list, target="target")

[29]:

	feat_0	feat_1	feat_2	feat_3	feat_4	feat_5	feat_6	feat_7	feat_8	feat_9	y_pred	target	group
1	0.544988	0.935186	0.633147	1.000000	0.363659	1.000000	1.000000	0.127226	0.130375	0.829266	0.659026	0.753928	0.0
3	0.576900	0.223677	0.000000	0.858757	0.355044	0.302377	0.303945	1.000000	0.771786	0.000000	0.391783	0.571078	0.0

[50]:

# The per-group train/test split is random. With no explicit random_state it
# draws from NumPy's global RNG, so seed that RNG here to make the results
# repeatable on a fresh Restart & Run All.
np.random.seed(25)

# Fit one model per group; the extra positional arguments are forwarded to
# the helper as `feat_list` and `target`.
df_results = (
    df
    .groupby("group")
    .apply(my_super_cool_function, features_list, "target")
)

[69]:

# (df
#  .groupby("group")
# .describe())

[55]:

def magnify():
    """Return table styles that enlarge headers and cells on mouse hover.

    The result is a list of ``{"selector": ..., "props": [...]}`` dicts, the
    form passed to ``Styler.set_table_styles`` further down the notebook.
    """
    css_rules = (
        # Compact defaults: small header text, no cell padding.
        ("th", [("font-size", "8pt")]),
        ("td", [("padding", "0em 0em")]),
        # Hover states: enlarge the header or the cell under the pointer.
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [("max-width", "200px"), ("font-size", "12pt")]),
    )
    return [{"selector": selector, "props": props} for selector, props in css_rules]

[68]:

# NOTE(review): nothing in this cell appears to draw random numbers; the seed
# is kept so global RNG state is unchanged for anything run afterwards, but
# it probably belongs before the train/test split instead.
np.random.seed(25)
cmap = sns.diverging_palette(5, 250, as_cmap=True)

# Take the absolute value of the prediction error only. Calling .abs() on the
# whole frame also flipped the sign of negative predictions in the display.
(df_results[["y_pred", "target"]]
 .assign(difference=lambda preds: (preds["target"] - preds["y_pred"]).abs())
 .style.background_gradient(cmap, axis=1)
 .set_caption("Hover to magnify")
 .set_table_styles(magnify()))

[68]:

Hover to magnify
		y_pred	target	difference
group
0	5	0.167801	0.208114	0.040312
0	1	0.764280	0.753928	0.010352
1	19	0.666128	0.000000	0.666128
1	12	0.700345	0.076113	0.624233
2	22	0.696564	0.000000	0.696564
2	23	1.656731	0.247903	1.408828
3	36	0.062992	0.117258	0.180250
3	37	0.318920	0.393723	0.074803
4	43	0.559197	0.406571	0.152626
4	40	0.883836	0.019515	0.864321
5	53	0.459780	0.296724	0.163055
5	59	0.543750	0.379198	0.164552
6	60	0.857828	0.453361	0.404467
6	68	0.903212	0.278278	0.624934
7	71	0.127945	0.000000	0.127945
7	75	0.113815	0.567656	0.681471
8	81	0.507365	0.000000	0.507365
8	84	0.147551	0.401466	0.549016
9	97	0.907485	1.000000	0.092515
9	91	0.361340	0.717797	0.356457

[42]:

# _ = plt.figure(figsize=(30, 10))
# _ = sns.heatmap(df
#                 .drop("target", axis=1)
#  .groupby("group")
# .describe())