Model analysis
In [1]:
import os
import sys

sys.path.append(os.getcwd())
# run from the repository root so relative paths like "data/..." resolve
os.chdir("..")

import pandas as pd

# show wide/long frames in full when inspecting results
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
In [2]:
from churn_pred.utils import dill_load
from churn_pred.training.utils import get_feature_importance
from churn_pred.eda.target.analysis import correlation
from churn_pred.eda.target.plotting import prob_distrib_per_class
from sklearn.model_selection import train_test_split
from pprint import pprint
import lightgbm as lgb
from churn_pred.training.utils import to_lgbdataset
import shap
import warnings
import numpy as np
In [3]:
trainer = dill_load("data/model/lgbm_trainer.dill")
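The loaded trainer bundles the fitted LightGBM model with the metadata used throughout this notebook. A quick inspection (a small sketch, assuming these attributes print cleanly) makes the later cells easier to follow:

print("target column:", trainer.target_col)
print("id columns:", trainer.id_cols)
print("categorical columns:", trainer.cat_cols)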
In [4]:
df_pd = pd.read_parquet("data/dataset_auxiliary_features_cleaned.parquet")
cat_cols = [
"Country",
"Gender",
"HasCreditCard",
"IsActiveMember",
"CustomerFeedback_sentiment3",
"CustomerFeedback_sentiment5",
"Surname_Country",
"Surname_Country_region",
"Surname_Country_subregion",
"Country_region",
"Country_subregion",
"is_native",
"Country_hemisphere",
"Country_IncomeGroup",
"Surname_Country_IncomeGroup",
"working_class",
"stage_of_life",
"generation",
]
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
df_pd,
test_size=valid_size,
stratify=df_pd[trainer.target_col],
random_state=random_state,
)
df_valid, df_test = train_test_split(
df_valid,
test_size=test_size,
stratify=df_valid[trainer.target_col],
random_state=random_state,
)
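Before scoring, it is worth confirming that the stratified split preserved the churn rate in each partition. A sanity-check sketch (not part of the original notebook):

# class balance per split; value_counts works regardless of the target dtype
for name, split in [("train", df_train), ("valid", df_valid), ("test", df_test)]:
    print(name, split.shape, split[trainer.target_col].value_counts(normalize=True).round(3).to_dict())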
In [5]:
df_predicted_proba = trainer.predict_proba(
df=df_test.drop(columns=trainer.target_col), binary2d=False
)
df_predicted_cls = trainer.predict_cls(df=df_test.drop(columns=trainer.target_col))
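A single hold-out metric gives useful context before the per-feature analysis. The sketch below assumes binary2d=False returns the positive-class probability in test-set row order:

from sklearn.metrics import roc_auc_score

auc = roc_auc_score(
    df_test[trainer.target_col].astype(int),
    df_predicted_proba.values.flatten(),
)
print(f"test ROC AUC: {auc:.4f}")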
Model feature importance and the model's "certainty" in predicting classes: probability density per class
In [6]:
lgb.plot_importance(trainer.model, importance_type="split")
# lgb.plot_importance(trainer.model, importance_type="gain")
lgb.plot_importance(trainer.model, importance_type="split")
# lgb.plot_importance(trainer.model, importance_type="gain")
Out[6]:
<Axes: title={'center': 'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
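The commented-out gain view can also be pulled into a sorted table through the Booster API, which is often easier to scan than the plot (a small sketch using the same fitted model):

gain_importance = pd.Series(
    trainer.model.feature_importance(importance_type="gain"),
    index=trainer.model.feature_name(),
).sort_values(ascending=False)
print(gain_importance.head(10))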
In [7]:
fig = prob_distrib_per_class(
predicted_probs=df_predicted_proba.values.flatten(),
actual=df_test[trainer.target_col].values.flatten(),
task="binary",
)
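As a numeric companion to the density plot, the predicted probabilities can be summarized per actual class (a sketch assuming df_predicted_proba holds the positive-class probability aligned with df_test):

probs = pd.Series(df_predicted_proba.values.flatten(), index=df_test.index, name="p_churn")
print(probs.groupby(df_test[trainer.target_col]).describe())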
SHAP analysis
In [8]:
# apply the trainer's fitted preprocessing to the raw test features, re-attach the
# target, and wrap the result as a LightGBM Dataset for the contribution computation
df_test_prep = trainer.preprocessors[0].transform(
    df_test.drop(columns=[trainer.target_col])
)
df_test_prep[trainer.target_col] = df_test[trainer.target_col].astype(int)
lgb_test, _ = to_lgbdataset(
    train=df_test_prep,
    cat_cols=trainer.cat_cols,
    target_col=trainer.target_col,
    id_cols=trainer.id_cols,
)
In [9]:
# pred_contrib=True returns one contribution column per feature plus a final column
# holding the expected value of the raw (log-odds) output, i.e. the SHAP base value
contribs = trainer.model.predict(data=lgb_test.data, pred_contrib=True)
shap_values = contribs[:, :-1]
base_value = np.mean(contribs[:, -1])
In [10]:
shap.summary_plot(
shap_values, features=lgb_test.data, plot_type="dot", sort=False, show=True
)
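A compact global ranking to accompany the beeswarm plot is the mean absolute SHAP value per feature (a sketch assuming the contribution columns follow trainer.model.feature_name() order, which is LightGBM's convention for pred_contrib output):

mean_abs_shap = pd.Series(
    np.abs(shap_values).mean(axis=0),
    index=trainer.model.feature_name(),
).sort_values(ascending=False)
print(mean_abs_shap.head(10))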
In [11]:
df_predicted_cls[28:30]
Out[11]:
|    | Exited |
|----|--------|
| 28 | 0      |
| 29 | 1      |
In [12]:
df_test.iloc[28:30]
Out[12]:
|      | CustomerId | CreditScore | Country | Gender | Age | Tenure | Balance (EUR) | NumberOfProducts | HasCreditCard | IsActiveMember | EstimatedSalary | Exited | CustomerFeedback_sentiment3 | CustomerFeedback_sentiment5 | Surname_Country | Surname_Country_region | Surname_Country_subregion | Country_region | Country_subregion | is_native | Country_hemisphere | Country_gdp_per_capita | Country_IncomeGroup | Surname_Country_gdp_per_capita | Surname_Country_IncomeGroup | working_class | stage_of_life | generation |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9941 | 15714240 | 712 | Spain | Male | 74 | 5 | 0.0 | 2 | 0 | 0 | 151425.82 | 0 | neutral | 2 stars | Russian Federation | Europe | Eastern Europe | Europe | Southern Europe | 0 | northern | 48685.49631 | High income | 34637.76172 | Upper middle income | elderly | senior_adult | boomers_1 |
| 7444 | 15658057 | 812 | Spain | Female | 44 | 8 | 0.0 | 3 | 1 | 0 | 66926.83 | 1 | neutral | 1 star | Italy | Europe | Southern Europe | Europe | Southern Europe | 0 | northern | 48685.49631 | High income | 55442.07843 | High income | working_age | middle_age_adult | millennials |
In [13]:
# decision plot for the two test rows inspected above (positions 28 and 29)
shap_decision_plot = shap.decision_plot(
    base_value=base_value,
    shap_values=shap_values[28:30],
    feature_names=trainer.model.feature_name(),
    # legend_labels=legend_labels,
    link="logit",
    # row_index=0,
    # feature_order=list(range(len(trainer.model.feature_name()))),
)
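Because the contributions are additive in log-odds space, they can be cross-checked against the model's predicted probability for any row, which also motivates the link="logit" argument above. A sketch, assuming row order is preserved between df_test and lgb_test and that the trainer applies no extra post-processing:

row = 29  # the churned customer inspected above
logit = base_value + shap_values[row].sum()
print("probability reconstructed from SHAP:", 1 / (1 + np.exp(-logit)))
print("probability from predict_proba:     ", df_predicted_proba.values.flatten()[row])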