Feature analysis
Purpose:
- analyse the dataset features on their own, without including the target value
- check whether the dataset looks healthy:
    - duplicated values
    - NA values
    - zero values
- check whether values are "too constant", i.e. most probably poor features for classification/regression:
    - very small std values
    - entropy at either extreme - "Entropy is a measure of disorder or uncertainty", so a near-constant column scores very low and an identifier-like column very high (see the sketch right after this list)
    - a categorical feature with too many unique values - most probably unable to identify any group of targets
- drop highly correlated features:
    - to improve the performance of model training
    - to decrease the dataset size for further exploration
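As a reference for the entropy check above: a minimal sketch of Shannon entropy over a column's value frequencies (an illustration of the idea, not necessarily how the churn_pred helper computes it):

import numpy as np
import pandas as pd

def column_entropy(s: pd.Series) -> float:
    """Shannon entropy (in nats) of the empirical value distribution of a column."""
    p = s.value_counts(normalize=True, dropna=True).to_numpy()
    return float(-(p * np.log(p)).sum())

# column_entropy(pd.Series(["a"] * 100))            -> 0.0 (constant column)
# column_entropy(pd.Series(range(100)).astype(str)) -> log(100) ≈ 4.6 (identifier-like)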
Imports
In [1]:
import os
import sys

sys.path.append(os.getcwd())
os.chdir("../..")
import pandas as pd
# import plotly.graph_objects as go
from churn_pred.eda.features.analysis import (
    missing,
    zero,
    nunique,
    init_check,
    std,
    entropy,
)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)
from churn_pred.eda.features.plotting import cross_correlation, distributions
Dataset
In [2]:
df_pd = pd.read_parquet("data/dataset_auxiliary_features_cleaned.parquet")
df_pd.head()
Out[2]:
  | CustomerId | CreditScore | Country | Gender | Age | Tenure | Balance (EUR) | NumberOfProducts | HasCreditCard | IsActiveMember | EstimatedSalary | Exited | CustomerFeedback_sentiment3 | CustomerFeedback_sentiment5 | Surname_Country | Surname_Country_region | Surname_Country_subregion | Country_region | Country_subregion | is_native | Country_hemisphere | Country_gdp_per_capita | Country_IncomeGroup | Surname_Country_gdp_per_capita | Surname_Country_IncomeGroup | working_class | stage_of_life | generation
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 15787619 | 844 | France | Male | 18 | 2 | 160980.03 | 1 | 0 | 0 | 145936.28 | 0 | neutral | 4 stars | Taiwan | Asia | Eastern Asia | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 32756.00000 | None | working_age | teen | gen_z |
1 | 15770309 | 656 | France | Male | 18 | 10 | 151762.74 | 1 | 0 | 1 | 127014.32 | 0 | neutral | 1 star | United States | Americas | Northern America | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 76329.58227 | High income | working_age | teen | gen_z |
2 | 15569178 | 570 | France | Female | 18 | 4 | 82767.42 | 1 | 1 | 0 | 71811.90 | 0 | neutral | 2 stars | Russian Federation | Europe | Eastern Europe | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 34637.76172 | Upper middle income | working_age | teen | gen_z |
3 | 15795519 | 716 | Germany | Female | 18 | 3 | 128743.80 | 1 | 0 | 0 | 197322.13 | 0 | neutral | 2 stars | Russian Federation | Europe | Eastern Europe | Europe | Western Europe | 0 | northern | 66616.02225 | High income | 34637.76172 | Upper middle income | working_age | teen | gen_z |
4 | 15621893 | 727 | France | Male | 18 | 4 | 133550.67 | 1 | 1 | 1 | 46941.41 | 0 | positive | 1 star | Italy | Europe | Southern Europe | Europe | Western Europe | 0 | northern | 57594.03402 | High income | 55442.07843 | High income | working_age | teen | gen_z |
In [3]:
target_col = "Exited"
id_cols = ["CustomerId"]
cat_cols = [
    "Country",
    "Gender",
    "HasCreditCard",
    "IsActiveMember",
    "CustomerFeedback_sentiment3",
    "CustomerFeedback_sentiment5",
    "Surname_Country",
    "Surname_Country_region",
    "Surname_Country_subregion",
    "Country_region",
    "Country_subregion",
    "is_native",
    "Country_hemisphere",
    "Country_IncomeGroup",
    "Surname_Country_IncomeGroup",
    "working_class",
    "stage_of_life",
    "generation",
]
cont_cols = df_pd.drop(
    columns=id_cols + cat_cols + [target_col]
).columns.values.tolist()
In [4]:
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
Features
Analysis
In [5]:
duplicated_ids, cont_cols_desc, cat_cols_desc = init_check(
    df=df_pd,
    identifier=id_cols,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
    verbose=True,
)
# duplicated_ids, cont_cols_desc.head(), cat_cols_desc.head()
[CHECK] Number of duplicated ids: 0

[CHECK] Numerical columns
                                  count       mean       std       min        25%        50%        75%        max
Age                             10000.0      38.92     10.49     18.00      32.00      37.00      44.00      92.00
EstimatedSalary                 10000.0  100090.24  57510.49     11.58   51002.11  100193.91  149388.25  199992.48
Balance (EUR)                   10000.0   76485.89  62397.41      0.00       0.00   97198.54  127644.24  250898.09
CreditScore                     10000.0     650.53     96.65    350.00     584.00     652.00     718.00     850.00
Tenure                          10000.0       5.01      2.89      0.00       3.00       5.00       7.00      10.00
Country_gdp_per_capita          10000.0   57651.01   6330.96  48685.50   57594.03   57594.03   66616.02   66616.02
NumberOfProducts                10000.0       1.53      0.58      1.00       1.00       1.00       2.00       4.00
Surname_Country_gdp_per_capita  10000.0   51228.52  25976.50   1337.83   33525.30   57460.51   76329.58  133822.76

[CHECK] Categorical columns
                             count  unique  top               freq
Gender                       10000  2       Male              5457
Surname_Country_subregion    10000  18      Northern America  2410
Country_subregion            10000  2       Western Europe    7523
Country_hemisphere           10000  1       northern          10000
Country_region               10000  1       Europe            10000
is_native                    10000  2       0                 9914
Surname_Country              10000  89      United States     2401
CustomerFeedback_sentiment3  10000  3       neutral           8352
Surname_Country_region       10000  4       Europe            4392
working_class                10000  2       working_age       9736
stage_of_life                10000  4       adult             6330
IsActiveMember               10000  2       1                 5151
CustomerFeedback_sentiment5  10000  5       2 stars           7121
HasCreditCard                10000  2       1                 7055
generation                   10000  6       millennials       6367
Country_IncomeGroup          10000  1       High income       10000
Surname_Country_IncomeGroup  10000  5       High income       6686
Country                      10000  3       France            5014
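init_check is a churn_pred helper; judging from the report above it counts duplicated identifiers and returns describe()-style summaries for the numerical and categorical columns. A rough pandas sketch under that assumption (not the actual implementation):

# Hypothetical equivalent of the report printed above.
n_dup_ids = df_pd.duplicated(subset=id_cols).sum()
print(f"[CHECK] Number of duplicated ids: {n_dup_ids}")
cont_summary = df_pd[cont_cols].describe().T.round(2)  # count/mean/std/min/quartiles/max
cat_summary = df_pd[cat_cols].describe().T             # count/unique/top/freq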
In [6]:
missing_val_frac, fig = missing(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
missing_val_frac.head()
Out[6]:
CreditScore                    0.0
Surname_Country_region         0.0
stage_of_life                  0.0
working_class                  0.0
Surname_Country_IncomeGroup    0.0
dtype: float64
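No column has missing values in this dataset. A one-line pandas sketch of the fraction that missing presumably reports:

# Fraction of NA values per column.
missing_frac = df_pd.drop(columns=id_cols).isna().mean()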
In [7]:
zero_val_frac, fig = zero(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
zero_val_frac.head()
Out[7]:
Exited                       0.7963
Balance (EUR)                0.3617
Tenure                       0.0413
CreditScore                  0.0000
Surname_Country_subregion    0.0000
dtype: float64
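Exited is simply an imbalanced binary target (about 80% zeros), and roughly 36% of customers have a zero Balance (EUR). A pandas sketch of the zero-fraction check, assuming it counts exact zeros per column:

# Fraction of values equal to zero per column, largest first.
zero_frac = df_pd.drop(columns=id_cols).eq(0).mean().sort_values(ascending=False)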
In [8]:
data_nunique, fig = nunique(
    df=df_pd.drop(columns=id_cols),
    scale="log",
    plot=True,
)
data_nunique.head()
Out[8]:
Country_IncomeGroup    1
Country_hemisphere     1
Country_region         1
IsActiveMember         2
is_native              2
dtype: int64
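Country_IncomeGroup, Country_hemisphere and Country_region have a single unique value, i.e. they are constant and cannot separate any group of targets. A sketch of what nunique presumably wraps:

# Number of distinct values per column, lowest first; 1 means the column is constant.
n_unique = df_pd.drop(columns=id_cols).nunique().sort_values()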
In [9]:
data_std, fig = std(
    df=df_pd[cont_cols],
    scale="log",
    plot=True,
)
data_std.head()
Out[9]:
Balance (EUR)                     62397.405202
EstimatedSalary                   57510.492818
Surname_Country_gdp_per_capita    25976.502611
Country_gdp_per_capita             6330.960841
CreditScore                          96.653299
dtype: float64
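Keep in mind that the standard deviation is scale-dependent, so the large values for Balance (EUR) and EstimatedSalary mostly reflect their units rather than their usefulness; the check is mainly about values close to zero. A sketch of the computation, assuming std operates on the continuous columns:

# Standard deviation of each continuous feature, largest first.
cont_std = df_pd[cont_cols].std().sort_values(ascending=False)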
In [10]:
col_entropies, fig = entropy(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
col_entropies.head()
Out[10]:
EstimatedSalary    9.210202
Balance (EUR)      6.246510
CreditScore        5.873574
Age                3.677614
Surname_Country    2.481289
dtype: float64
Plotting
In [11]:
fig = cross_correlation(df=df_pd[cont_cols], n=10, verbose=True)
Top 10 absolute correlations
Balance (EUR)     Country_gdp_per_capita            0.329762
                  NumberOfProducts                  0.304180
Age               NumberOfProducts                  0.030680
                  Country_gdp_per_capita            0.029999
                  Balance (EUR)                     0.028308
Tenure            Surname_Country_gdp_per_capita    0.017138
NumberOfProducts  EstimatedSalary                   0.014204
Tenure            NumberOfProducts                  0.013444
Balance (EUR)     EstimatedSalary                   0.012797
Tenure            Balance (EUR)                     0.012254
dtype: float64
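The strongest absolute correlation here is about 0.33 (Balance (EUR) vs Country_gdp_per_capita), so no continuous feature pair looks redundant enough to drop. A pandas sketch of how such a top-n ranking can be produced (an illustration, not necessarily what cross_correlation does internally):

import numpy as np

# Absolute pairwise correlations, upper triangle only, ranked descending.
corr = df_pd[cont_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
top_pairs = upper.stack().sort_values(ascending=False).head(10)
print(top_pairs)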
In [12]:
fig = distributions(
    df=df_pd[cont_cols],
    low_per_cut=0,
    high_per_cut=1,
    type="box",
)
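distributions comes from churn_pred.eda.features.plotting; low_per_cut and high_per_cut appear to be lower/upper quantile fractions used to trim extreme values before plotting (0 and 1 keep everything). A hedged plotly sketch of an equivalent box-plot figure under that assumption:

import plotly.graph_objects as go

def box_distributions(df, low_per_cut=0.0, high_per_cut=1.0):
    """One box trace per column, clipped to the given quantile range (assumed semantics)."""
    fig = go.Figure()
    for col in df.columns:
        s = df[col]
        lo, hi = s.quantile(low_per_cut), s.quantile(high_per_cut)
        fig.add_trace(go.Box(y=s[(s >= lo) & (s <= hi)], name=col))
    return fig

# fig = box_distributions(df_pd[cont_cols], low_per_cut=0, high_per_cut=1)
# fig.show()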