Model analysis
In [1]:
import os
import sys

sys.path.append(os.getcwd())
# run from the repository root so relative paths like "data/..." resolve
os.chdir("..")

import pandas as pd

# show wide/long frames in full when inspecting results
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
In [2]:
from churn_pred.utils import dill_load
from churn_pred.training.utils import get_feature_importance
from churn_pred.eda.target.analysis import correlation
from churn_pred.eda.target.plotting import prob_distrib_per_class
from sklearn.model_selection import train_test_split
from pprint import pprint
import lightgbm as lgb
from churn_pred.training.utils import to_lgbdataset
import shap
import warnings
import numpy as np
In [3]:
trainer = dill_load("data/model/lgbm_trainer.dill")
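The loaded trainer bundles the fitted LightGBM model with the metadata used throughout this notebook. A quick inspection (a small sketch, assuming these attributes print cleanly) makes the later cells easier to follow:

print("target column:", trainer.target_col)
print("id columns:", trainer.id_cols)
print("categorical columns:", trainer.cat_cols)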
In [4]:
df_pd = pd.read_parquet("data/dataset_auxiliary_features_cleaned.parquet")
cat_cols = [
"Country",
"Gender",
"HasCreditCard",
"IsActiveMember",
"CustomerFeedback_sentiment3",
"CustomerFeedback_sentiment5",
"Surname_Country",
"Surname_Country_region",
"Surname_Country_subregion",
"Country_region",
"Country_subregion",
"is_native",
"Country_hemisphere",
"Country_IncomeGroup",
"Surname_Country_IncomeGroup",
"working_class",
"stage_of_life",
"generation",
]
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
df_pd,
test_size=valid_size,
stratify=df_pd[trainer.target_col],
random_state=random_state,
)
df_valid, df_test = train_test_split(
df_valid,
test_size=test_size,
stratify=df_valid[trainer.target_col],
random_state=random_state,
)
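Before scoring, it is worth confirming that the stratified split preserved the churn rate in each partition. A sanity-check sketch (not part of the original notebook):

# class balance per split; value_counts works regardless of the target dtype
for name, split in [("train", df_train), ("valid", df_valid), ("test", df_test)]:
    print(name, split.shape, split[trainer.target_col].value_counts(normalize=True).round(3).to_dict())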
In [5]:
df_predicted_proba = trainer.predict_proba(
df=df_test.drop(columns=trainer.target_col), binary2d=False
)
df_predicted_cls = trainer.predict_cls(df=df_test.drop(columns=trainer.target_col))
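A single hold-out metric gives useful context before the per-feature analysis. The sketch below assumes binary2d=False returns the positive-class probability in test-set row order:

from sklearn.metrics import roc_auc_score

auc = roc_auc_score(
    df_test[trainer.target_col].astype(int),
    df_predicted_proba.values.flatten(),
)
print(f"test ROC AUC: {auc:.4f}")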
Model feature importance and the model's "certainty" in predicting classes: probability density per class
In [6]:
lgb.plot_importance(trainer.model, importance_type="split")
# lgb.plot_importance(trainer.model, importance_type="gain")
lgb.plot_importance(trainer.model, importance_type="split")
# lgb.plot_importance(trainer.model, importance_type="gain")
Out[6]:
<Axes: title={'center': 'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
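The commented-out gain view can also be pulled into a sorted table through the Booster API, which is often easier to scan than the plot (a small sketch using the same fitted model):

gain_importance = pd.Series(
    trainer.model.feature_importance(importance_type="gain"),
    index=trainer.model.feature_name(),
).sort_values(ascending=False)
print(gain_importance.head(10))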
In [7]:
fig = prob_distrib_per_class(
predicted_probs=df_predicted_proba.values.flatten(),
actual=df_test[trainer.target_col].values.flatten(),
task="binary",
)
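As a numeric companion to the density plot, the predicted probabilities can be summarized per actual class (a sketch assuming df_predicted_proba holds the positive-class probability aligned with df_test):

probs = pd.Series(df_predicted_proba.values.flatten(), index=df_test.index, name="p_churn")
print(probs.groupby(df_test[trainer.target_col]).describe())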
SHAP analysis
In [8]:
# apply the trainer's fitted preprocessing to the raw test features, re-attach the
# target, and wrap the result as a LightGBM Dataset for the contribution computation
df_test_prep = trainer.preprocessors[0].transform(
    df_test.drop(columns=[trainer.target_col])
)
df_test_prep[trainer.target_col] = df_test[trainer.target_col].astype(int)
lgb_test, _ = to_lgbdataset(
    train=df_test_prep,
    cat_cols=trainer.cat_cols,
    target_col=trainer.target_col,
    id_cols=trainer.id_cols,
)
In [9]:
# pred_contrib=True returns one contribution column per feature plus a final column
# holding the expected value of the raw (log-odds) output, i.e. the SHAP base value
contribs = trainer.model.predict(data=lgb_test.data, pred_contrib=True)
shap_values = contribs[:, :-1]
base_value = np.mean(contribs[:, -1])
In [10]:
shap.summary_plot(
shap_values, features=lgb_test.data, plot_type="dot", sort=False, show=True
)
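A compact global ranking to accompany the beeswarm plot is the mean absolute SHAP value per feature (a sketch assuming the contribution columns follow trainer.model.feature_name() order, which is LightGBM's convention for pred_contrib output):

mean_abs_shap = pd.Series(
    np.abs(shap_values).mean(axis=0),
    index=trainer.model.feature_name(),
).sort_values(ascending=False)
print(mean_abs_shap.head(10))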
In [11]:
df_predicted_cls[28:30]
Out[11]:
|    | Exited |
|----|--------|
| 28 | 0      |
| 29 | 1      |
In [12]:
df_test.iloc[28:30]
Out[12]:
|      | CustomerId | CreditScore | Country | Gender | Age | Tenure | Balance (EUR) | NumberOfProducts | HasCreditCard | IsActiveMember | EstimatedSalary | Exited | CustomerFeedback_sentiment3 | CustomerFeedback_sentiment5 | Surname_Country | Surname_Country_region | Surname_Country_subregion | Country_region | Country_subregion | is_native | Country_hemisphere | Country_gdp_per_capita | Country_IncomeGroup | Surname_Country_gdp_per_capita | Surname_Country_IncomeGroup | working_class | stage_of_life | generation |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9941 | 15714240 | 712 | Spain | Male | 74 | 5 | 0.0 | 2 | 0 | 0 | 151425.82 | 0 | neutral | 2 stars | Russian Federation | Europe | Eastern Europe | Europe | Southern Europe | 0 | northern | 48685.49631 | High income | 34637.76172 | Upper middle income | elderly | senior_adult | boomers_1 |
| 7444 | 15658057 | 812 | Spain | Female | 44 | 8 | 0.0 | 3 | 1 | 0 | 66926.83 | 1 | neutral | 1 star | Italy | Europe | Southern Europe | Europe | Southern Europe | 0 | northern | 48685.49631 | High income | 55442.07843 | High income | working_age | middle_age_adult | millennials |
In [13]:
# decision plot for the two test rows inspected above (positions 28 and 29)
shap_decision_plot = shap.decision_plot(
    base_value=base_value,
    shap_values=shap_values[28:30],
    feature_names=trainer.model.feature_name(),
    # legend_labels=legend_labels,
    link="logit",
    # row_index=0,
    # feature_order=list(range(len(trainer.model.feature_name()))),
)
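Because the contributions are additive in log-odds space, they can be cross-checked against the model's predicted probability for any row, which also motivates the link="logit" argument above. A sketch, assuming row order is preserved between df_test and lgb_test and that the trainer applies no extra post-processing:

row = 29  # the churned customer inspected above
logit = base_value + shap_values[row].sum()
print("probability reconstructed from SHAP:", 1 / (1 + np.exp(-logit)))
print("probability from predict_proba:     ", df_predicted_proba.values.flatten()[row])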