An example of using groupby-apply to fit a model separately to each group
[10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
[2]:
# Synthetic regression problem: 100 rows, 10 features of which 5 carry signal.
# coef=True additionally returns the ground-truth coefficients of the
# underlying linear model; random_state pins the generated data.
X, y, coef = make_regression(
    n_samples=100,
    n_features=10,
    n_informative=5,
    n_targets=1,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=100,
    shuffle=True,
    coef=True,
    random_state=42,
)
[3]:
# Show the ground-truth coefficients as a single wide row, one column per feature.
pd.DataFrame(coef, index=[f"feat_{i}" for i in range(len(coef))]).transpose()
[3]:
feat_0 | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 16.748258 | 0.0 | 0.0 | 63.643025 | 0.0 | 70.647573 | 0.0 | 10.456784 | 3.158614 | 0.0 |
[4]:
# Combine the feature matrix and the target vector into one frame,
# aligning the two on their (default, positional) index.
df = pd.merge(
    pd.DataFrame(X, columns=[f"feat_{i}" for i in range(X.shape[1])]),
    pd.DataFrame({"target": y}),
    left_index=True,
    right_index=True,
)
[6]:
# Sanity check: expect 100 rows and 11 columns (10 features + target).
df.shape
[6]:
(100, 11)
[8]:
# Label consecutive runs of 10 rows with group ids 0..9 (10 groups x 10 rows).
df["group"] = np.repeat(range(10), repeats=10)
[12]:
# Preview the first rows, which now include the `group` label column.
df.head()
[12]:
feat_0 | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | target | group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.926930 | -1.430141 | 1.632411 | -3.241267 | -1.247783 | -1.024388 | 0.130741 | -0.059525 | -0.252568 | -0.440044 | -186.494628 | 0 |
1 | 0.202923 | 0.334457 | 0.285865 | 1.547505 | -0.387702 | 1.795878 | 2.010205 | -1.515744 | -0.612789 | 0.658544 | 191.976107 | 0 |
2 | -0.241236 | 0.456753 | 0.342725 | -1.251539 | 1.117296 | 1.443765 | 0.447709 | 0.352055 | -0.082151 | 0.569767 | 315.503594 | 0 |
3 | 0.289775 | -1.008086 | -2.038125 | 0.871125 | -0.408075 | -0.326024 | -0.351513 | 2.075401 | 1.201214 | -1.870792 | 100.185659 | 0 |
4 | -0.007973 | -0.190339 | -1.037246 | 0.077368 | 0.538910 | -0.861284 | -1.382800 | 1.479944 | 1.523124 | -0.875618 | -40.813080 | 0 |
[32]:
def my_super_cool_function(df, feat_list, target, random_state=None):
    """Fit a linear regression on one group's rows and return held-out predictions.

    The feature and target columns are min-max scaled, the rows are split
    80/20 into train and test parts, a ``LinearRegression`` is fitted on the
    training part and used to predict the test part.

    Note that the scaler is fitted on all rows passed in, i.e. before the
    train/test split, and that the target is scaled too, so ``y_pred`` and
    ``target`` in the result are expressed in scaled units.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to model (typically the rows of a single group). Must contain
        the columns named in ``feat_list`` and ``target``; any other columns
        are ignored.
    feat_list : list of str
        Names of the feature columns.
    target : str
        Name of the target column.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``train_test_split``. Pass an integer to make the split
        (and therefore the returned rows) reproducible; ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The scaled feature columns of the test rows, plus ``y_pred`` (model
        prediction) and ``target`` (scaled true value), indexed like ``df``.
    """
    feature_cols = list(feat_list)
    # Scale only the columns the model actually uses. MinMaxScaler works
    # column by column, so the values are the same as when scaling the whole
    # frame, but extra columns (the grouping key, non-numeric metadata, ...)
    # are no longer rescaled and can no longer make the scaler fail.
    model_cols = feature_cols + [target]
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(df[model_cols]),
        index=df.index,
        columns=model_cols,
    )
    X = scaled[feature_cols]
    y = scaled[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return X_test.assign(y_pred=y_pred, target=y_test)
[28]:
# Names of every column whose label matches the regex "feat".
features_list = [name for name in df.filter(regex="feat").columns]
[29]:
# Try the function on a single group first, before applying it to all groups.
my_super_cool_function(
    df.loc[df["group"] == 0],
    feat_list=features_list,
    target="target",
)
[29]:
feat_0 | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | y_pred | target | group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.544988 | 0.935186 | 0.633147 | 1.000000 | 0.363659 | 1.000000 | 1.000000 | 0.127226 | 0.130375 | 0.829266 | 0.659026 | 0.753928 | 0.0 |
3 | 0.576900 | 0.223677 | 0.000000 | 0.858757 | 0.355044 | 0.302377 | 0.303945 | 1.000000 | 0.771786 | 0.000000 | 0.391783 | 0.571078 | 0.0 |
[50]:
# Run the per-group model on every group; the results are stacked under a
# (group, original row index) MultiIndex.
df_results = df.groupby("group").apply(
    my_super_cool_function,
    feat_list=features_list,
    target="target",
)
[69]:
# (df
# .groupby("group")
# .describe())
[55]:
def magnify():
    """Return table styles that render a compact table and enlarge cells on hover.

    The result is a list of ``{"selector": ..., "props": [...]}`` dicts in the
    format accepted by ``Styler.set_table_styles``.
    """
    # (CSS selector, list of (property, value) pairs)
    rules = (
        ("th", [("font-size", "8pt")]),
        ("td", [("padding", "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [("max-width", "200px"), ("font-size", "12pt")]),
    )
    return [{"selector": selector, "props": props} for selector, props in rules]
[68]:
# Seed kept from the original cell; nothing in this cell draws random numbers.
np.random.seed(25)
cmap = sns.diverging_palette(5, 250, as_cmap=True)
(df_results[["y_pred", "target"]]
 # Take the absolute value of the error only. Calling .abs() on the whole
 # frame also flipped the sign of negative predictions, so the displayed
 # y_pred no longer matched the model output or the reported difference.
 .assign(difference=lambda x: (x["target"] - x["y_pred"]).abs())
 .style.background_gradient(cmap, axis=1)
 .set_caption("Hover to magnify")
 .set_table_styles(magnify()))
[68]:
y_pred | target | difference | ||
---|---|---|---|---|
group | ||||
0 | 5 | 0.167801 | 0.208114 | 0.040312 |
1 | 0.764280 | 0.753928 | 0.010352 | |
1 | 19 | 0.666128 | 0.000000 | 0.666128 |
12 | 0.700345 | 0.076113 | 0.624233 | |
2 | 22 | 0.696564 | 0.000000 | 0.696564 |
23 | 1.656731 | 0.247903 | 1.408828 | |
3 | 36 | 0.062992 | 0.117258 | 0.180250 |
37 | 0.318920 | 0.393723 | 0.074803 | |
4 | 43 | 0.559197 | 0.406571 | 0.152626 |
40 | 0.883836 | 0.019515 | 0.864321 | |
5 | 53 | 0.459780 | 0.296724 | 0.163055 |
59 | 0.543750 | 0.379198 | 0.164552 | |
6 | 60 | 0.857828 | 0.453361 | 0.404467 |
68 | 0.903212 | 0.278278 | 0.624934 | |
7 | 71 | 0.127945 | 0.000000 | 0.127945 |
75 | 0.113815 | 0.567656 | 0.681471 | |
8 | 81 | 0.507365 | 0.000000 | 0.507365 |
84 | 0.147551 | 0.401466 | 0.549016 | |
9 | 97 | 0.907485 | 1.000000 | 0.092515 |
91 | 0.361340 | 0.717797 | 0.356457 |
[42]:
# _ = plt.figure(figsize=(30, 10))
# _ = sns.heatmap(df
# .drop("target", axis=1)
# .groupby("group")
# .describe())