Feature analysis
Purpose:
- analyse the dataset features on their own, without including the target value
- check whether the dataset looks healthy:
    - duplicated values
    - NA values
    - zero values
- check whether values are "too constant", i.e. most probably poor features for classification/regression:
    - very small std values
    - entropy at either extreme - "Entropy is a measure of disorder or uncertainty", so a near-constant column scores very low and an identifier-like column very high (see the sketch right after this list)
    - a categorical feature with too many unique values - most probably unable to identify any group of targets
- drop highly correlated features:
    - to improve the performance of model training
    - to decrease the dataset size for further exploration
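As a reference for the entropy check above: a minimal sketch of Shannon entropy over a column's value frequencies (an illustration of the idea, not necessarily how the churn_pred helper computes it):

import numpy as np
import pandas as pd

def column_entropy(s: pd.Series) -> float:
    """Shannon entropy (in nats) of the empirical value distribution of a column."""
    p = s.value_counts(normalize=True, dropna=True).to_numpy()
    return float(-(p * np.log(p)).sum())

# column_entropy(pd.Series(["a"] * 100))            -> 0.0 (constant column)
# column_entropy(pd.Series(range(100)).astype(str)) -> log(100) ≈ 4.6 (identifier-like)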
Imports
In [1]:
import os
import sys

sys.path.append(os.getcwd())
os.chdir("../..")
import pandas as pd
# import plotly.graph_objects as go
from churn_pred.eda.features.analysis import (
    missing,
    zero,
    nunique,
    init_check,
    std,
    entropy,
)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)
from churn_pred.eda.features.plotting import cross_correlation, distributions
Dataset
In [2]:
df_pd = pd.read_parquet("data/dataset_auxiliary_features_cleaned.parquet")
df_pd.head()
Out[2]:
  | CustomerId | CreditScore | Country | Gender | Age | Tenure | Balance (EUR) | NumberOfProducts | HasCreditCard | IsActiveMember | EstimatedSalary | Exited | CustomerFeedback_sentiment3 | CustomerFeedback_sentiment5 | Surname_Country | Surname_Country_region | Surname_Country_subregion | Country_region | Country_subregion | is_native | Country_hemisphere | Country_gdp_per_capita | Country_IncomeGroup | Surname_Country_gdp_per_capita | Surname_Country_IncomeGroup | working_class | stage_of_life | generation
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 15787619 | 844 | France | Male | 18 | 2 | 160980.03 | 1 | 0 | 0 | 145936.28 | 0 | neutral | 4 stars | Taiwan | Asia | Eastern Asia | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 32756.00000 | None | working_age | teen | gen_z |
1 | 15770309 | 656 | France | Male | 18 | 10 | 151762.74 | 1 | 0 | 1 | 127014.32 | 0 | neutral | 1 star | United States | Americas | Northern America | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 76329.58227 | High income | working_age | teen | gen_z |
2 | 15569178 | 570 | France | Female | 18 | 4 | 82767.42 | 1 | 1 | 0 | 71811.90 | 0 | neutral | 2 stars | Russian Federation | Europe | Eastern Europe | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 34637.76172 | Upper middle income | working_age | teen | gen_z |
3 | 15795519 | 716 | Germany | Female | 18 | 3 | 128743.80 | 1 | 0 | 0 | 197322.13 | 0 | neutral | 2 stars | Russian Federation | Europe | Eastern Europe | Europe | Western Europe | 0 | northern | 66616.02225 | High income | 34637.76172 | Upper middle income | working_age | teen | gen_z |
4 | 15621893 | 727 | France | Male | 18 | 4 | 133550.67 | 1 | 1 | 1 | 46941.41 | 0 | positive | 1 star | Italy | Europe | Southern Europe | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 55442.07843 | High income | working_age | teen | gen_z |
In [3]:
target_col = "Exited"
id_cols = ["CustomerId"]
cat_cols = [
    "Country",
    "Gender",
    "HasCreditCard",
    "IsActiveMember",
    "CustomerFeedback_sentiment3",
    "CustomerFeedback_sentiment5",
    "Surname_Country",
    "Surname_Country_region",
    "Surname_Country_subregion",
    "Country_region",
    "Country_subregion",
    "is_native",
    "Country_hemisphere",
    "Country_IncomeGroup",
    "Surname_Country_IncomeGroup",
    "working_class",
    "stage_of_life",
    "generation",
]
cont_cols = df_pd.drop(
    columns=id_cols + cat_cols + [target_col]
).columns.values.tolist()
In [4]:
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
Features
Analysis
In [5]:
duplicated_ids, cont_cols_desc, cat_cols_desc = init_check(
    df=df_pd,
    identifier=id_cols,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
    verbose=True,
)
# duplicated_ids, cont_cols_desc.head(), cat_cols_desc.head()
[CHECK] Number of duplicated ids: 0

[CHECK] Numerical columns
                                  count       mean       std       min        25%        50%        75%        max
Age                             10000.0      38.92     10.49     18.00      32.00      37.00      44.00      92.00
EstimatedSalary                 10000.0  100090.24  57510.49     11.58   51002.11  100193.91  149388.25  199992.48
Balance (EUR)                   10000.0   76485.89  62397.41      0.00       0.00   97198.54  127644.24  250898.09
CreditScore                     10000.0     650.53     96.65    350.00     584.00     652.00     718.00     850.00
Tenure                          10000.0       5.01      2.89      0.00       3.00       5.00       7.00      10.00
Country_gdp_per_capita          10000.0   57651.01   6330.96  48685.50   57594.03   57594.03   66616.02   66616.02
NumberOfProducts                10000.0       1.53      0.58      1.00       1.00       1.00       2.00       4.00
Surname_Country_gdp_per_capita  10000.0   51228.52  25976.50   1337.83   33525.30   57460.51   76329.58  133822.76

[CHECK] Categorical columns
                             count  unique  top               freq
Gender                       10000  2       Male              5457
Surname_Country_subregion    10000  18      Northern America  2410
Country_subregion            10000  2       Western Europe    7523
Country_hemisphere           10000  1       northern          10000
Country_region               10000  1       Europe            10000
is_native                    10000  2       0                 9914
Surname_Country              10000  89      United States     2401
CustomerFeedback_sentiment3  10000  3       neutral           8352
Surname_Country_region       10000  4       Europe            4392
working_class                10000  2       working_age       9736
stage_of_life                10000  4       adult             6330
IsActiveMember               10000  2       1                 5151
CustomerFeedback_sentiment5  10000  5       2 stars           7121
HasCreditCard                10000  2       1                 7055
generation                   10000  6       millennials       6367
Country_IncomeGroup          10000  1       High income       10000
Surname_Country_IncomeGroup  10000  5       High income       6686
Country                      10000  3       France            5014
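init_check is a churn_pred helper; judging from the report above it counts duplicated identifiers and returns describe()-style summaries for the numerical and categorical columns. A rough pandas sketch under that assumption (not the actual implementation):

# Hypothetical equivalent of the report printed above.
n_dup_ids = df_pd.duplicated(subset=id_cols).sum()
print(f"[CHECK] Number of duplicated ids: {n_dup_ids}")
cont_summary = df_pd[cont_cols].describe().T.round(2)  # count/mean/std/min/quartiles/max
cat_summary = df_pd[cat_cols].describe().T             # count/unique/top/freq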
In [6]:
missing_val_frac, fig = missing(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
missing_val_frac.head()
Out[6]:
CreditScore                    0.0
Surname_Country_region         0.0
stage_of_life                  0.0
working_class                  0.0
Surname_Country_IncomeGroup    0.0
dtype: float64
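No column has missing values in this dataset. A one-line pandas sketch of the fraction that missing presumably reports:

# Fraction of NA values per column.
missing_frac = df_pd.drop(columns=id_cols).isna().mean()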
In [7]:
zero_val_frac, fig = zero(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
zero_val_frac.head()
Out[7]:
Exited                       0.7963
Balance (EUR)                0.3617
Tenure                       0.0413
CreditScore                  0.0000
Surname_Country_subregion    0.0000
dtype: float64
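Exited is simply an imbalanced binary target (about 80% zeros), and roughly 36% of customers have a zero Balance (EUR). A pandas sketch of the zero-fraction check, assuming it counts exact zeros per column:

# Fraction of values equal to zero per column, largest first.
zero_frac = df_pd.drop(columns=id_cols).eq(0).mean().sort_values(ascending=False)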
In [8]:
data_nunique, fig = nunique(
    df=df_pd.drop(columns=id_cols),
    scale="log",
    plot=True,
)
data_nunique.head()
Out[8]:
Country_IncomeGroup    1
Country_hemisphere     1
Country_region         1
IsActiveMember         2
is_native              2
dtype: int64
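Country_IncomeGroup, Country_hemisphere and Country_region have a single unique value, i.e. they are constant and cannot separate any group of targets. A sketch of what nunique presumably wraps:

# Number of distinct values per column, lowest first; 1 means the column is constant.
n_unique = df_pd.drop(columns=id_cols).nunique().sort_values()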
In [9]:
data_std, fig = std(
    df=df_pd[cont_cols],
    scale="log",
    plot=True,
)
data_std.head()
Out[9]:
Balance (EUR)                     62397.405202
EstimatedSalary                   57510.492818
Surname_Country_gdp_per_capita    25976.502611
Country_gdp_per_capita             6330.960841
CreditScore                          96.653299
dtype: float64
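Keep in mind that the standard deviation is scale-dependent, so the large values for Balance (EUR) and EstimatedSalary mostly reflect their units rather than their usefulness; the check is mainly about values close to zero. A sketch of the computation, assuming std operates on the continuous columns:

# Standard deviation of each continuous feature, largest first.
cont_std = df_pd[cont_cols].std().sort_values(ascending=False)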
In [10]:
col_entropies, fig = entropy(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
col_entropies.head()
Out[10]:
EstimatedSalary    9.210202
Balance (EUR)      6.246510
CreditScore        5.873574
Age                3.677614
Surname_Country    2.481289
dtype: float64
Plotting
In [11]:
fig = cross_correlation(df=df_pd[cont_cols], n=10, verbose=True)
Top 10 absolute correlations
Balance (EUR)     Country_gdp_per_capita            0.329762
                  NumberOfProducts                  0.304180
Age               NumberOfProducts                  0.030680
                  Country_gdp_per_capita            0.029999
                  Balance (EUR)                     0.028308
Tenure            Surname_Country_gdp_per_capita    0.017138
NumberOfProducts  EstimatedSalary                   0.014204
Tenure            NumberOfProducts                  0.013444
Balance (EUR)     EstimatedSalary                   0.012797
Tenure            Balance (EUR)                     0.012254
dtype: float64
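The strongest absolute correlation here is about 0.33 (Balance (EUR) vs Country_gdp_per_capita), so no continuous feature pair looks redundant enough to drop. A pandas sketch of how such a top-n ranking can be produced (an illustration, not necessarily what cross_correlation does internally):

import numpy as np

# Absolute pairwise correlations, upper triangle only, ranked descending.
corr = df_pd[cont_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
top_pairs = upper.stack().sort_values(ascending=False).head(10)
print(top_pairs)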
In [12]:
fig = distributions(
    df=df_pd[cont_cols],
    low_per_cut=0,
    high_per_cut=1,
    type="box",
)
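distributions comes from churn_pred.eda.features.plotting; low_per_cut and high_per_cut appear to be lower/upper quantile fractions used to trim extreme values before plotting (0 and 1 keep everything). A hedged plotly sketch of an equivalent box-plot figure under that assumption:

import plotly.graph_objects as go

def box_distributions(df, low_per_cut=0.0, high_per_cut=1.0):
    """One box trace per column, clipped to the given quantile range (assumed semantics)."""
    fig = go.Figure()
    for col in df.columns:
        s = df[col]
        lo, hi = s.quantile(low_per_cut), s.quantile(high_per_cut)
        fig.add_trace(go.Box(y=s[(s >= lo) & (s <= hi)], name=col))
    return fig

# fig = box_distributions(df_pd[cont_cols], low_per_cut=0, high_per_cut=1)
# fig.show()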