Preprocess data

analysis

init_check

init_check(
    df,
    identifier=None,
    cat_cols=None,
    cont_cols=None,
    verbose=False,
)

Procedure to check:

  • duplicated rows in the dataset
  • general stats of numerical features
  • general stats of categorical features

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • identifier (str) –

    column which identifies unique user IDs

  • cat_cols (list) –

    categorical features in the dataset

  • cont_cols (list) –

    numerical features in the dataset

  • verbose (bool) –

    whether to print the check results

Returns:

  • duplicated_ids( int ) –

    number of duplicated identifier values

  • cont_cols_desc( pd.DataFrame ) –

    summary statistics of the numerical features

  • cat_cols_desc( pd.DataFrame ) –

    summary statistics of the categorical features
Source code in churn_pred/eda/features/analysis.py
def init_check(
    df: pd.DataFrame,
    identifier: Optional[str] = None,
    cat_cols: Optional[list] = None,
    cont_cols: Optional[list] = None,
    verbose: bool = False,
) -> Tuple[Optional[int], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """Procedure to check:
    * duplicated rows in the dataset
    * general stats of numerical features
    * general stats of categorical features

    Args:
        df (DataFrame): pandas dataframe
        identifier (str): column which identifies unique user IDs
        cat_cols (list): categorical features in the dataset
        cont_cols (list): numerical features in the dataset
        verbose (bool): whether to print the check results

    Returns:
        duplicated_ids (int): number of duplicated identifier values
        cont_cols_desc (pd.DataFrame): summary statistics of the numerical features
        cat_cols_desc (pd.DataFrame): summary statistics of the categorical features
    """
    data = df.copy()

    # Initialize results so the return below works when an argument is omitted
    duplicated_ids, cont_cols_desc, cat_cols_desc = None, None, None

    if identifier:
        duplicated_ids = data[identifier].duplicated().sum()
        if verbose:
            print("[CHECK] Number of duplicated ids: {}".format(duplicated_ids))

    if cont_cols:
        cont_cols_f = intsec(data.columns.values, cont_cols)
        cont_cols_desc = data[cont_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Numerical columns")
            with pd.option_context("display.precision", 2):
                print(cont_cols_desc)

    if cat_cols:
        cat_cols_f = intsec(data.columns.values, cat_cols)
        cat_cols_desc = data[cat_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Categorical columns")
            with pd.option_context("display.precision", 2):
                print(cat_cols_desc)
    return duplicated_ids, cont_cols_desc, cat_cols_desc
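
A minimal usage sketch, assuming the module is importable as churn_pred.eda.features.analysis (mirroring the source path above):

import pandas as pd

from churn_pred.eda.features.analysis import init_check

df = pd.DataFrame(
    {
        "user_id": [1, 2, 2, 3],
        "age": [34, 45, 45, 29],
        "plan": ["basic", "pro", "pro", "basic"],
    }
)

# user_id 2 appears twice, so duplicated_ids should be 1
duplicated_ids, cont_desc, cat_desc = init_check(
    df, identifier="user_id", cat_cols=["plan"], cont_cols=["age"], verbose=True
)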

missing

missing(df, scale='linear', plot=False)

Procedure to check the fraction of missing values in the dataset.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • missing_val_frac( Series ) –

    sorted series with the fraction of missing values per feature

  • fig( Figure ) –

    plot with sorted fractions of missing values in each column (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def missing(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to check the fraction of missing values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        missing_val_frac (Series): sorted series with the fraction of missing values
            per feature
        fig (Figure): plot with sorted fractions of missing values in each column
            (None if plot is False)
    """
    data = df.copy()
    missing_val_frac = data.isna().sum() / len(data)
    missing_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            missing_val_frac,
            ax,
            title="Fraction of missing values in the dataset",
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return missing_val_frac, fig
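
A quick sketch of a typical call (import path assumed from the source location):

import numpy as np
import pandas as pd

from churn_pred.eda.features.analysis import missing

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, np.nan, 1.0]})
missing_val_frac, fig = missing(df, scale="linear", plot=True)
print(missing_val_frac)  # b: 0.667, a: 0.333 (sorted descending)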

zero

zero(df, scale='linear', plot=False)

Procedure to check the fraction of zero values in the dataset.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • zero_val_frac( Series ) –

    sorted series with the fraction of '0' values per feature

  • fig( Figure ) –

    plot with sorted fractions of '0' values in each column (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def zero(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to check the fraction of zero values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        zero_val_frac (Series): sorted series with the fraction of '0' values per
            feature
        fig (Figure): plot with sorted fractions of '0' values in each column
            (None if plot is False)
    """
    data = df.copy()
    zero_val_frac = data.isin([0]).sum() / len(data)
    zero_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(zero_val_frac, ax, title="Fraction of zero values in the dataset")
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return zero_val_frac, fig
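
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import zero

df = pd.DataFrame({"a": [0, 0, 5], "b": [0, 1, 2]})
zero_val_frac, fig = zero(df, plot=True)
print(zero_val_frac)  # a: 0.667, b: 0.333 (sorted descending)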

nunique

nunique(df, scale='linear', plot=False)

Procedure to plot features sorted by their number of unique values.

Parameters:

  • df (DataFrame) –

    dataset

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • data_nunique( Series ) –

    sorted series with the number of unique values per feature

  • fig( Figure ) –

    plot with sorted number of unique values in features (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def nunique(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot features sorted by their number of unique values.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_nunique (Series): sorted series with the number of unique values per
            feature
        fig (Figure): plot with sorted number of unique values in features
            (None if plot is False)
    """
    data = df.copy()

    data_nunique = data.nunique()
    data_nunique.sort_values(ascending=True, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_nunique.dropna(),
            ax,
            title="Features number of unique values (NA are dropped)",
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return data_nunique, fig
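
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import nunique

df = pd.DataFrame({"plan": ["a", "a", "b", "b"], "id": [1, 2, 3, 4]})
data_nunique, fig = nunique(df, scale="log", plot=True)
print(data_nunique)  # plan: 2, id: 4 (sorted ascending)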

std

std(df, scale='linear', plot=False)

Procedure to plot features sorted by their standard deviation.

Parameters:

  • df (DataFrame) –

    dataset

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • data_std( Series ) –

    sorted series with the standard deviation of each continuous feature

  • fig( Figure ) –

    plot with sorted standard deviation of continuous features (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def std(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot features sorted by their standard deviation.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_std (Series): sorted series with the standard deviation of each
            continuous feature
        fig (Figure): plot with sorted standard deviation of continuous features
            (None if plot is False)
    """
    data = df.copy()

    data_std = data.std()

    data_std.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_std.dropna(), ax, title="Features standard deviation (NA are dropped)"
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return data_std, fig
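
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import std

df = pd.DataFrame({"small": [1.0, 1.1, 0.9], "large": [10.0, 30.0, 50.0]})
data_std, fig = std(df, scale="log", plot=True)
print(data_std)  # large first, small second (sorted descending)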

entropy

entropy(df, scale='linear', plot=False)

Procedure to plot features sorted by their entropy.

Parameters:

  • df (DataFrame) –

    dataset

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • col_entropies( Series ) –

    sorted series with the entropy of each feature

  • fig( Figure ) –

    plot with sorted entropy of the features (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def entropy(
    df: pd.DataFrame, scale: Literal["log", "linear"] = "linear", plot: bool = False
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot features sorted by their entropy.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        col_entropies (Series): sorted series with the entropy of each feature
        fig (Figure): plot with sorted entropy of the features (None if plot is False)
    """
    data = df.copy()

    # Cast to str so numerical and categorical columns are treated alike
    data = data.astype(str)
    col_entropies = data.apply(entropy_calc, axis=0)
    col_entropies.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            col_entropies.dropna(), ax, title="Features entropies (NA are dropped)"
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return col_entropies, fig
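
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import entropy

df = pd.DataFrame({"constant": [1, 1, 1, 1], "varied": [1, 2, 3, 4]})
col_entropies, fig = entropy(df, plot=True)
print(col_entropies)  # varied: ln(4) ≈ 1.386, constant: 0.0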

plotting

cross_correlation

cross_correlation(df, n=10, verbose=False)

Procedure to calculate and plot cross-correlation of features in the dataset.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • verbose (bool) –

    show n most correlated features

  • n (int) –

    number of top correlated feature pairs shown in the verbose output

Returns:

  • fig( Figure ) –

    heatmap of continuous features cross correlations

Source code in churn_pred/eda/features/plotting.py
def cross_correlation(
    df: pd.DataFrame,
    n: int = 10,
    verbose: bool = False,
) -> Figure:
    """Procedure to calculate and plot cross-correlation of features
    in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        verbose (bool): show n most correlated features
        n (int): number of top correlated feature pairs shown in the verbose output

    Returns:
        fig (Figure): heatmap of continuous features cross correlations
    """
    data = df.copy()

    # corrwith uses numpy which does not like pandas Float dtypes
    corr_matrix = data.astype(float).corr()
    # Keep only the upper triangle (k=1 drops the diagonal) so each pair appears
    # once, then rank pairs by absolute correlation
    upper_stacked_sorted = (
        corr_matrix.abs()
        .where(np.triu(np.ones(corr_matrix.shape), k=1).astype("bool"))
        .stack()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("Cross-correlation matrix", size=20)
    sns.set(font_scale=1.0)
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cbar=True,
        square=True,
        # linewidths=0.5,
        fmt=".2f",
    )
    if verbose:
        print("Top {} absolute correlations".format(n))
        print(upper_stacked_sorted[:n])
    return fig
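
A minimal sketch of the expected usage, with two deliberately correlated columns (assumed import path):

import numpy as np
import pandas as pd

from churn_pred.eda.features.plotting import cross_correlation

rng = np.random.default_rng(42)
x = rng.normal(size=200)
df = pd.DataFrame(
    {"x": x, "x_noisy": x + rng.normal(scale=0.1, size=200), "z": rng.normal(size=200)}
)
fig = cross_correlation(df, n=3, verbose=True)  # the (x, x_noisy) pair ranks first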

distributions

distributions(
    df, low_per_cut=0, high_per_cut=1, type="box"
)

Procedure to plot distributions of the features, using a workaround for violin plots in seaborn:

  • https://stackoverflow.com/a/64787568/8147433

DISCLAIMER: for now, NA values are filled with 0.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • low_per_cut (float) –

    lower percentile where to cut the plot for better readability

  • high_per_cut (float) –

    higher percentile where to cut the plot for better readability

  • type (str) –

    type of distribution plot

Returns:

  • fig( Figure ) –

    distribution plot for each feature

Source code in churn_pred/eda/features/plotting.py
def distributions(
    df: pd.DataFrame,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
    type: Literal["box", "violin"] = "box",
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        low_per_cut (float): lower percentile where to cut the plot for better
            readability
        high_per_cut (float): higher percentile where to cut the plot for better
            readability
        type (str): type of distribution plot

    Returns:
        fig (Figure): distribution plot for each feature
    """
    data = df.copy()
    cols = data.columns.values
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle("Distribution of feature values", size=20)
    for i, col in enumerate(cols):
        if type == "violin":
            sns.violinplot(
                ax=ax[i],
                x=col,
                orient="h",
                # cut=0,
                # showextrem=False,
                data=data,
            )
        elif type == "box":
            sns.boxplot(
                ax=ax[i],
                x=col,
                orient="h",
                data=data,
            )
        ax[i].set_xlim(
            data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
        )
        ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9)
    return fig
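
A minimal sketch of the expected usage; trimming the top percentile keeps long-tailed features readable (assumed import path):

import numpy as np
import pandas as pd

from churn_pred.eda.features.plotting import distributions

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {"income": rng.lognormal(size=300), "tenure": rng.integers(0, 60, size=300)}
)
fig = distributions(df, low_per_cut=0.0, high_per_cut=0.99, type="box")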

analysis

correlation

correlation(df, target, scale='linear', plot=False)

Procedure to plot the numerical features most correlated with the target column.

Parameters:

  • df (pd.DataFrame) –

    pandas dataframe

  • target (pd.Series) –

    target values

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • sorted_corr_cols( pd.Series ) –

    sorted series with feature and target value correlation

  • fig( Figure ) –

    sorted bar plot with feature and target value correlation (None if plot is False)

Source code in churn_pred/eda/target/analysis.py
def correlation(
    df: pd.DataFrame,
    target: pd.Series,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot the numerical features most correlated with
    the target column.

    Args:
        df (pd.DataFrame): pandas dataframe
        target (pd.Series): target values
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        sorted_corr_cols (pd.Series): sorted series with feature and target value
            correlation
        fig (Figure): sorted bar plot with feature and target value correlation
            (None if plot is False)
    """
    data = df.copy()
    target_col = target.name
    correlation_df = pd.concat([data, target], axis=1).corr()
    sorted_corr_cols = correlation_df[target_col].drop(index=[target_col])
    sorted_corr_cols.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            sorted_corr_cols,
            ax,
            title=f"Sorted features by their correlation with {target_col}",
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return sorted_corr_cols, fig
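
A minimal sketch of the expected usage (assumed import path):

import numpy as np
import pandas as pd

from churn_pred.eda.target.analysis import correlation

rng = np.random.default_rng(0)
churn = pd.Series(rng.integers(0, 2, size=300), name="churn")
df = pd.DataFrame(
    {
        "complaints": 2 * churn + rng.normal(size=300),  # correlated with the target
        "noise": rng.normal(size=300),
    }
)
sorted_corr_cols, fig = correlation(df, churn, plot=True)
print(sorted_corr_cols)  # complaints first, noise near zero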

plotting

prob_distrib_per_class

prob_distrib_per_class(predicted_probs, actual, task)

Procedure to plot probability density distributions per class from LightGBM predictions.

Parameters:

  • predicted_probs (ndarray) –

    predicted probs

  • actual (ndarray) –

    ground truth classes

  • task (str) –

    type of task

Returns:

  • fig( Figure ) –

    probability density distribution plot for each class

Source code in churn_pred/eda/target/plotting.py
def prob_distrib_per_class(
    predicted_probs: np.ndarray,
    actual: np.ndarray,
    task: Literal["binary", "multiclass"],
) -> Figure:
    """Procedure to plot probability density distributions per class from LightGBM
    predictions.

    Args:
        predicted_probs (ndarray): predicted probs
        actual (ndarray): ground truth classes
        task (str): type of task

    Returns:
        fig (Figure): probability density distribution plot for each class
    """
    if task == "binary":
        temp = pd.DataFrame({"predicted_proba": predicted_probs, "actual": actual})

        fig, ax = plt.subplots(figsize=(3, 2))
        fig.suptitle("Predicted probability density per class")
        for label, alpha, color in zip([0, 1], [1, 0.7], ["black", "red"]):
            ax.hist(
                temp[temp["actual"] == label]["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                alpha=alpha,
                color=color,
                ec="k",
            )
        ax.set_xlim([0, 1])
        ax.set_xlabel("probability")
        ax.set_ylabel("probability_density")
    elif task == "multiclass":
        temp = pd.DataFrame(
            {
                # For each sample, pick the predicted probability of its true class
                "predicted_proba": np.take_along_axis(
                    predicted_probs, np.vstack(actual), axis=1  # type: ignore
                ).flatten(),
                "actual": actual,
            }
        )

        # Dictionary of color for each label
        color_d = dict(
            zip_longest(
                temp["actual"].unique(),
                plt.rcParams["axes.prop_cycle"].by_key()["color"],
            )
        )

        n_classes = temp["actual"].nunique()
        fig, ax = plt.subplots(
            ncols=n_classes, figsize=(n_classes * 3, 2), sharex=True, sharey=True
        )
        fig.suptitle("Predicted probability density per class")
        plt.subplots_adjust(wspace=0.3, top=0.75)

        for i, (label, gp) in enumerate(temp.groupby("actual")):
            ax[i].hist(
                gp["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                color=color_d[label],
                ec="k",
            )
            ax[i].set_title(label)
            ax[i].set_xlim([0, 1])
            ax[i].set_xlabel("probability")
            ax[i].set_ylabel("probability_density")
    return fig
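
A minimal sketch for the binary case, with synthetic, well-separated probabilities (assumed import path):

import numpy as np

from churn_pred.eda.target.plotting import prob_distrib_per_class

rng = np.random.default_rng(0)
actual = rng.integers(0, 2, size=500)
# P(class 1): low for actual 0, high for actual 1
predicted_probs = np.clip(0.7 * actual + 0.3 * rng.random(500), 0.0, 1.0)
fig = prob_distrib_per_class(predicted_probs, actual, task="binary")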

distributions_in_binary_cls

distributions_in_binary_cls(
    df, target, low_per_cut=0, high_per_cut=1
)

Procedure to plot distributions of the features split by the binary target column, using a workaround for split violin plots in seaborn:

  • https://stackoverflow.com/a/64787568/8147433

DISCLAIMER: for now, NA values are filled with 0.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • target (pd.Series) –

    target values, i.e. binary classes

  • low_per_cut (float) –

    lower percentile where to cut the plot for better readability

  • high_per_cut (float) –

    higher percentile where to cut the plot for better readability

Returns:

  • fig( Figure ) –

    distribution plot for each feature

Source code in churn_pred/eda/target/plotting.py
def distributions_in_binary_cls(
    df: pd.DataFrame,
    target: pd.Series,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        target (pd.Series): target values, i.e. binary classes

    Returns:
        fig (Figure): ditribution plot per each feature
    """
    data = pd.concat([df, target], axis=1)
    cols = data.columns.values
    data["dummy"] = 0
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle(f"Distribution of feature values splitted by {target.name}", size=20)
    for i, col in enumerate(cols):
        if col != target.name:
            sns.violinplot(
                ax=ax[i],
                hue=target.name,
                x=col,
                y="dummy",
                orient="h",
                cut=0,
                showextrem=False,
                split=True,
                data=data,
            )
            ax[i].get_legend().remove()
            ax[i].set_xlim(
                data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
            )
            ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9, top=0.97)
    return fig
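
A minimal sketch of the expected usage, assuming the import path above and a seaborn version compatible with the split-violin workaround referenced in the docstring:

import numpy as np
import pandas as pd

from churn_pred.eda.target.plotting import distributions_in_binary_cls

rng = np.random.default_rng(0)
target = pd.Series(rng.integers(0, 2, size=300), name="churn")
df = pd.DataFrame(
    {
        "tenure": rng.integers(0, 60, size=300) - 10 * target,  # shifted per class
        "spend": rng.lognormal(size=300),
    }
)
fig = distributions_in_binary_cls(df, target, high_per_cut=0.99)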

plotting

bar_plot

bar_plot(df, ax, title)

Helper function for a unified bar plot.

Parameters:

  • df (DataFrame) –

    dataframe to plot

  • ax (Axes) –

    axes defining where to plot it

  • title (str) –

    title of the plot

Returns:

  • adjusted_plot( Axes ) –

    adjusted axes

Source code in churn_pred/eda/plotting.py
def bar_plot(df: pd.DataFrame, ax: Axes, title: str) -> Axes:
    """Helper method for unified bar plot

    Args:
        df (DataFrame): dataframe to plot
        ax (Axes): axes defining where to plot it

    Returns:
        adjusted_plot (Axes): adjusted axes
    """
    data = df.copy()
    ax.set_title(title, size=20)
    ax.tick_params(labelsize=8)
    ax.set_xlabel("sorted features", size=12)
    return data.plot.bar(ax=ax)
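
A minimal sketch of the expected usage; note that the callers in this package pass a pandas Series, which also exposes .plot.bar (assumed import path):

import matplotlib.pyplot as plt
import pandas as pd

from churn_pred.eda.plotting import bar_plot

frac = pd.Series({"feature_a": 0.5, "feature_b": 0.2})
fig, ax = plt.subplots(figsize=(10, 5))
bar_plot(frac, ax, title="Fraction of missing values in the dataset")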

general_utils

intsec

intsec(list1, list2)

Simple intersection of two lists.

Parameters:

  • list1 (list) –

    first list

  • list2 (list) –

    second list

Returns:

  • list( list ) –

    intersection of the two lists (order not preserved)

Source code in churn_pred/eda/general_utils.py
def intsec(list1: list, list2: list) -> list:
    """Simple intersection of two lists.

    Args:
        list1 (list): first list
        list2 (list): second list

    Returns:
        list (list): intersection of the two lists (order not preserved)
    """
    return list(set.intersection(set(list1), set(list2)))
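
A minimal sketch of the expected usage (assumed import path):

from churn_pred.eda.general_utils import intsec

cont_cols = ["age", "income", "tenure"]
present = intsec(["age", "tenure", "churn"], cont_cols)
print(present)  # ['age', 'tenure'] in arbitrary order (set-based)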

entropy_calc

entropy_calc(labels, base=np.e)

Computes entropy of both continuous and categorical features. Shamelessly stolen from: https://stackoverflow.com/a/45091961

Parameters:

  • labels (list, ndarray, Series) –

    list of values

  • base (float) –

    logarithm base of the entropy; defaults to np.e (nats)

Returns:

  • ent( float ) –

    entropy of the list of values

Source code in churn_pred/eda/general_utils.py
def entropy_calc(labels: list, base: float = np.e) -> float:
    """Computes entropy of both continuous and categorical features.
    Shamelessly stolen from:
    https://stackoverflow.com/a/45091961

    Args:
        labels (list, ndarray, Series): list of values
        base (float): logarithm base of the entropy; defaults to np.e (nats)

    Returns:
        ent (float): entropy of the list of values
    """
    n_labels = len(labels)
    # Empirical probability of each distinct value
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    ent = float(0)
    if n_classes > 1:
        for i in probs:
            ent -= i * math.log(i, base)
    return ent
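
A minimal sketch of the expected usage (assumed import path):

from churn_pred.eda.general_utils import entropy_calc

print(entropy_calc([0, 0, 1, 1]))          # ln(2) ≈ 0.693 nats (base=np.e)
print(entropy_calc([0, 0, 1, 1], base=2))  # 1.0 bit
print(entropy_calc(["a", "a", "a"]))       # 0.0 for a single class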