Preprocess data

analysis

init_check

init_check(
    df,
    identifier=None,
    cat_cols=None,
    cont_cols=None,
    verbose=False,
)

Procedure to check:

  • duplicated rows in the dataset
  • general stats of numerical features
  • general stats of categorical features

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • identifier (str) –

    column which identifies unique user IDs

  • cat_cols (list) –

    categorical features in the dataset

  • cont_cols (list) –

    numerical features in the dataset

  • verbose (bool) –

    whether to print the check results

Returns:

  • duplicated_ids( int ) –

    number of duplicated identifier values

  • cont_cols_desc( pd.DataFrame ) –

    summary statistics of the numerical features

  • cat_cols_desc( pd.DataFrame ) –

    summary statistics of the categorical features
Source code in churn_pred/eda/features/analysis.py
def init_check(
    df: pd.DataFrame,
    identifier: Optional[str] = None,
    cat_cols: Optional[list] = None,
    cont_cols: Optional[list] = None,
    verbose: bool = False,
) -> Tuple[Optional[int], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """Procedure to check:
    * duplicated rows in the dataset
    * general stats of numerical features
    * general stats of categorical features

    Args:
        df (DataFrame): pandas dataframe
        identifier (str): column which identifies unique user IDs
        cat_cols (list): categorical features in the dataset
        cont_cols (list): numerical features in the dataset
        verbose (bool): whether to print the check results

    Returns:
        duplicated_ids (int): number of duplicated identifier values
        cont_cols_desc (pd.DataFrame): summary statistics of the numerical features
        cat_cols_desc (pd.DataFrame): summary statistics of the categorical features
    """
    data = df.copy()

    # Initialize results so the return below works when an argument is omitted
    duplicated_ids, cont_cols_desc, cat_cols_desc = None, None, None

    if identifier:
        duplicated_ids = data[identifier].duplicated().sum()
        if verbose:
            print("[CHECK] Number of duplicated ids: {}".format(duplicated_ids))

    if cont_cols:
        cont_cols_f = intsec(data.columns.values, cont_cols)
        cont_cols_desc = data[cont_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Numerical columns")
            with pd.option_context("display.precision", 2):
                print(cont_cols_desc)

    if cat_cols:
        cat_cols_f = intsec(data.columns.values, cat_cols)
        cat_cols_desc = data[cat_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Categorical columns")
            with pd.option_context("display.precision", 2):
                print(cat_cols_desc)
    return duplicated_ids, cont_cols_desc, cat_cols_desc
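
A minimal usage sketch, assuming the module is importable as churn_pred.eda.features.analysis (mirroring the source path above):

import pandas as pd

from churn_pred.eda.features.analysis import init_check

df = pd.DataFrame(
    {
        "user_id": [1, 2, 2, 3],
        "age": [34, 45, 45, 29],
        "plan": ["basic", "pro", "pro", "basic"],
    }
)

# user_id 2 appears twice, so duplicated_ids should be 1
duplicated_ids, cont_desc, cat_desc = init_check(
    df, identifier="user_id", cat_cols=["plan"], cont_cols=["age"], verbose=True
)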

missing

missing(df, scale='linear', plot=False)

Procedure to check the fraction of missing values in the dataset.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • missing_val_frac( Series ) –

    sorted series with the fraction of missing values per feature

  • fig( Figure ) –

    plot with sorted fractions of missing values in each column (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def missing(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to check the fraction of missing values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        missing_val_frac (Series): sorted series with the fraction of missing values
            per feature
        fig (Figure): plot with sorted fractions of missing values in each column
            (None if plot is False)
    """
    data = df.copy()
    missing_val_frac = data.isna().sum() / len(data)
    missing_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            missing_val_frac,
            ax,
            title="Fraction of missing values in the dataset",
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return missing_val_frac, fig
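
A quick sketch of a typical call (import path assumed from the source location):

import numpy as np
import pandas as pd

from churn_pred.eda.features.analysis import missing

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, np.nan, 1.0]})
missing_val_frac, fig = missing(df, scale="linear", plot=True)
print(missing_val_frac)  # b: 0.667, a: 0.333 (sorted descending)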

zero

zero(df, scale='linear', plot=False)

Procedure to check the fraction of zero values in the dataset.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • zero_val_frac( Series ) –

    sorted series with the fraction of '0' values per feature

  • fig( Figure ) –

    plot with sorted fractions of '0' values in each column (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def zero(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to check the fraction of zero values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        zero_val_frac (Series): sorted series with the fraction of '0' values per
            feature
        fig (Figure): plot with sorted fractions of '0' values in each column
            (None if plot is False)
    """
    data = df.copy()
    zero_val_frac = data.isin([0]).sum() / len(data)
    zero_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(zero_val_frac, ax, title="Fraction of zero values in the dataset")
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return zero_val_frac, fig
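
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import zero

df = pd.DataFrame({"a": [0, 0, 5], "b": [0, 1, 2]})
zero_val_frac, fig = zero(df, plot=True)
print(zero_val_frac)  # a: 0.667, b: 0.333 (sorted descending)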

nunique

nunique(df, scale='linear', plot=False)

Procedure to plot features sorted by their number of unique values.

Parameters:

  • df (DataFrame) –

    dataset

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • data_nunique( Series ) –

    sorted series with the number of unique values per feature

  • fig( Figure ) –

    plot with sorted number of unique values in features (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def nunique(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot features sorted by their number of unique values.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_nunique (Series): sorted series with the number of unique values per
            feature
        fig (Figure): plot with sorted number of unique values in features
            (None if plot is False)
    """
    data = df.copy()

    data_nunique = data.nunique()
    data_nunique.sort_values(ascending=True, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_nunique.dropna(),
            ax,
            title="Features number of unique values (NA are dropped)",
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return data_nunique, fig
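
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import nunique

df = pd.DataFrame({"plan": ["a", "a", "b", "b"], "id": [1, 2, 3, 4]})
data_nunique, fig = nunique(df, scale="log", plot=True)
print(data_nunique)  # plan: 2, id: 4 (sorted ascending)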

std

std(df, scale='linear', plot=False)

Procedure to plot features sorted by their standard deviation.

Parameters:

  • df (DataFrame) –

    dataset

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • data_std( Series ) –

    sorted series with the standard deviation of each continuous feature

  • fig( Figure ) –

    plot with sorted standard deviation of continuous features (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def std(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot features sorted by their standard deviation.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_std (Series): sorted series with the standard deviation of each
            continuous feature
        fig (Figure): plot with sorted standard deviation of continuous features
            (None if plot is False)
    """
    data = df.copy()

    data_std = data.std()

    data_std.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_std.dropna(), ax, title="Features standard deviation (NA are dropped)"
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return data_std, fig
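
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import std

df = pd.DataFrame({"small": [1.0, 1.1, 0.9], "large": [10.0, 30.0, 50.0]})
data_std, fig = std(df, scale="log", plot=True)
print(data_std)  # large first, small second (sorted descending)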

entropy

entropy(df, scale='linear', plot=False)

Procedure to plot features sorted by their entropy.

Parameters:

  • df (DataFrame) –

    dataset

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • col_entropies( Series ) –

    sorted series with the entropy of each feature

  • fig( Figure ) –

    plot with sorted entropy of the features (None if plot is False)

Source code in churn_pred/eda/features/analysis.py
def entropy(
    df: pd.DataFrame, scale: Literal["log", "linear"] = "linear", plot: bool = False
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot features sorted by their entropy.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        col_entropies (Series): sorted series with the entropy of each feature
        fig (Figure): plot with sorted entropy of the features (None if plot is False)
    """
    data = df.copy()

    # Cast to str so numerical and categorical columns are treated alike
    data = data.astype(str)
    col_entropies = data.apply(entropy_calc, axis=0)
    col_entropies.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            col_entropies.dropna(), ax, title="Features entropies (NA are dropped)"
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return col_entropies, fig
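
A minimal sketch of the expected usage (assumed import path):

import pandas as pd

from churn_pred.eda.features.analysis import entropy

df = pd.DataFrame({"constant": [1, 1, 1, 1], "varied": [1, 2, 3, 4]})
col_entropies, fig = entropy(df, plot=True)
print(col_entropies)  # varied: ln(4) ≈ 1.386, constant: 0.0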

plotting

cross_correlation

cross_correlation(df, n=10, verbose=False)

Procedure to calculate and plot cross-correlation of features in the dataset.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • verbose (bool) –

    show n most correlated features

  • n (int) –

    number of top correlated feature pairs shown in the verbose output

Returns:

  • fig( Figure ) –

    heatmap of continuous features cross correlations

Source code in churn_pred/eda/features/plotting.py
def cross_correlation(
    df: pd.DataFrame,
    n: int = 10,
    verbose: bool = False,
) -> Figure:
    """Procedure to calculate and plot cross-correlation of features
    in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        verbose (bool): show n most correlated features
        n (int): number of top correlated feature pairs shown in the verbose output

    Returns:
        fig (Figure): heatmap of continuous features cross correlations
    """
    data = df.copy()

    # corrwith uses numpy which does not like pandas Float dtypes
    corr_matrix = data.astype(float).corr()
    # Keep only the upper triangle (k=1 drops the diagonal) so each pair appears
    # once, then rank pairs by absolute correlation
    upper_stacked_sorted = (
        corr_matrix.abs()
        .where(np.triu(np.ones(corr_matrix.shape), k=1).astype("bool"))
        .stack()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("Cross-correlation matrix", size=20)
    sns.set(font_scale=1.0)
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cbar=True,
        square=True,
        # linewidths=0.5,
        fmt=".2f",
    )
    if verbose:
        print("Top {} absolute correlations".format(n))
        print(upper_stacked_sorted[:n])
    return fig
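
A minimal sketch of the expected usage, with two deliberately correlated columns (assumed import path):

import numpy as np
import pandas as pd

from churn_pred.eda.features.plotting import cross_correlation

rng = np.random.default_rng(42)
x = rng.normal(size=200)
df = pd.DataFrame(
    {"x": x, "x_noisy": x + rng.normal(scale=0.1, size=200), "z": rng.normal(size=200)}
)
fig = cross_correlation(df, n=3, verbose=True)  # the (x, x_noisy) pair ranks first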

distributions

distributions(
    df, low_per_cut=0, high_per_cut=1, type="box"
)

Procedure to plot distributions of the features, using a workaround for violin plots in seaborn:

  • https://stackoverflow.com/a/64787568/8147433

DISCLAIMER: for now, NA values are filled with 0.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • low_per_cut (float) –

    lower percentile where to cut the plot for better readability

  • high_per_cut (float) –

    higher percentile where to cut the plot for better readability

  • type (str) –

    type of distribution plot

Returns:

  • fig( Figure ) –

    distribution plot for each feature

Source code in churn_pred/eda/features/plotting.py
def distributions(
    df: pd.DataFrame,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
    type: Literal["box", "violin"] = "box",
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        low_per_cut (float): lower percentile where to cut the plot for better
            readability
        high_per_cut (float): higher percentile where to cut the plot for better
            readability
        type (str): type of distribution plot

    Returns:
        fig (Figure): distribution plot for each feature
    """
    data = df.copy()
    cols = data.columns.values
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle("Distribution of feature values", size=20)
    for i, col in enumerate(cols):
        if type == "violin":
            sns.violinplot(
                ax=ax[i],
                x=col,
                orient="h",
                # cut=0,
                # showextrem=False,
                data=data,
            )
        elif type == "box":
            sns.boxplot(
                ax=ax[i],
                x=col,
                orient="h",
                data=data,
            )
        ax[i].set_xlim(
            data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
        )
        ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9)
    return fig
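
A minimal sketch of the expected usage; trimming the top percentile keeps long-tailed features readable (assumed import path):

import numpy as np
import pandas as pd

from churn_pred.eda.features.plotting import distributions

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {"income": rng.lognormal(size=300), "tenure": rng.integers(0, 60, size=300)}
)
fig = distributions(df, low_per_cut=0.0, high_per_cut=0.99, type="box")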

analysis

correlation

correlation(df, target, scale='linear', plot=False)

Procedure to plot the numerical features most correlated with the target column.

Parameters:

  • df (pd.DataFrame) –

    pandas dataframe

  • target (pd.Series) –

    target values

  • scale (str) –

    y scale of the plot

  • plot (bool) –

    whether to output the plot

Returns:

  • sorted_corr_cols( pd.Series ) –

    sorted series with feature and target value correlation

  • fig( Figure ) –

    sorted bar plot with feature and target value correlation (None if plot is False)

Source code in churn_pred/eda/target/analysis.py
def correlation(
    df: pd.DataFrame,
    target: pd.Series,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.Series, Optional[Figure]]:
    """Procedure to plot the numerical features most correlated with
    the target column.

    Args:
        df (pd.DataFrame): pandas dataframe
        target (pd.Series): target values
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        sorted_corr_cols (pd.Series): sorted series with feature and target value
            correlation
        fig (Figure): sorted bar plot with feature and target value correlation
            (None if plot is False)
    """
    data = df.copy()
    target_col = target.name
    correlation_df = pd.concat([data, target], axis=1).corr()
    sorted_corr_cols = correlation_df[target_col].drop(index=[target_col])
    sorted_corr_cols.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            sorted_corr_cols,
            ax,
            title=f"Sorted features by their correlation with {target_col}",
        )
        # Apply the y scale only when a plot was actually created
        ax.set_yscale(scale)
    else:
        fig = None
    return sorted_corr_cols, fig
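
A minimal sketch of the expected usage (assumed import path):

import numpy as np
import pandas as pd

from churn_pred.eda.target.analysis import correlation

rng = np.random.default_rng(0)
churn = pd.Series(rng.integers(0, 2, size=300), name="churn")
df = pd.DataFrame(
    {
        "complaints": 2 * churn + rng.normal(size=300),  # correlated with the target
        "noise": rng.normal(size=300),
    }
)
sorted_corr_cols, fig = correlation(df, churn, plot=True)
print(sorted_corr_cols)  # complaints first, noise near zero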

plotting

prob_distrib_per_class

prob_distrib_per_class(predicted_probs, actual, task)

Procedure to plot probability density distributions per class from LightGBM predictions.

Parameters:

  • predicted_probs (ndarray) –

    predicted probs

  • actual (ndarray) –

    ground truth classes

  • task (str) –

    type of task

Returns:

  • fig( Figure ) –

    probability density distribution plot for each class

Source code in churn_pred/eda/target/plotting.py
def prob_distrib_per_class(
    predicted_probs: np.ndarray,
    actual: np.ndarray,
    task: Literal["binary", "multiclass"],
) -> Figure:
    """Procedure to plot probability density distributions per class from LightGBM
    predictions.

    Args:
        predicted_probs (ndarray): predicted probs
        actual (ndarray): ground truth classes
        task (str): type of task

    Returns:
        fig (Figure): probability density distribution plot for each class
    """
    if task == "binary":
        temp = pd.DataFrame({"predicted_proba": predicted_probs, "actual": actual})

        fig, ax = plt.subplots(figsize=(3, 2))
        fig.suptitle("Predicted probability density per class")
        for label, alpha, color in zip([0, 1], [1, 0.7], ["black", "red"]):
            ax.hist(
                temp[temp["actual"] == label]["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                alpha=alpha,
                color=color,
                ec="k",
            )
        ax.set_xlim([0, 1])
        ax.set_xlabel("probability")
        ax.set_ylabel("probability_density")
    elif task == "multiclass":
        temp = pd.DataFrame(
            {
                # For each sample, pick the predicted probability of its true class
                "predicted_proba": np.take_along_axis(
                    predicted_probs, np.vstack(actual), axis=1  # type: ignore
                ).flatten(),
                "actual": actual,
            }
        )

        # Dictionary of color for each label
        color_d = dict(
            zip_longest(
                temp["actual"].unique(),
                plt.rcParams["axes.prop_cycle"].by_key()["color"],
            )
        )

        n_classes = temp["actual"].nunique()
        fig, ax = plt.subplots(
            ncols=n_classes, figsize=(n_classes * 3, 2), sharex=True, sharey=True
        )
        fig.suptitle("Predicted probability density per class")
        plt.subplots_adjust(wspace=0.3, top=0.75)

        for i, (label, gp) in enumerate(temp.groupby("actual")):
            ax[i].hist(
                gp["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                color=color_d[label],
                ec="k",
            )
            ax[i].set_title(label)
            ax[i].set_xlim([0, 1])
            ax[i].set_xlabel("probability")
            ax[i].set_ylabel("probability_density")
    return fig
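
A minimal sketch for the binary case, with synthetic, well-separated probabilities (assumed import path):

import numpy as np

from churn_pred.eda.target.plotting import prob_distrib_per_class

rng = np.random.default_rng(0)
actual = rng.integers(0, 2, size=500)
# P(class 1): low for actual 0, high for actual 1
predicted_probs = np.clip(0.7 * actual + 0.3 * rng.random(500), 0.0, 1.0)
fig = prob_distrib_per_class(predicted_probs, actual, task="binary")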

distributions_in_binary_cls

distributions_in_binary_cls(
    df, target, low_per_cut=0, high_per_cut=1
)

Procedure to plot distributions of the features split by the binary target column, using a workaround for split violin plots in seaborn:

  • https://stackoverflow.com/a/64787568/8147433

DISCLAIMER: for now, NA values are filled with 0.

Parameters:

  • df (DataFrame) –

    pandas dataframe

  • target (pd.Series) –

    target values, i.e. binary classes

  • low_per_cut (float) –

    lower percentile where to cut the plot for better readability

  • high_per_cut (float) –

    higher percentile where to cut the plot for better readability

Returns:

  • fig( Figure ) –

    distribution plot for each feature

Source code in churn_pred/eda/target/plotting.py
def distributions_in_binary_cls(
    df: pd.DataFrame,
    target: pd.Series,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        target (pd.Series): target values, i.e. binary classes

    Returns:
        fig (Figure): ditribution plot per each feature
    """
    data = pd.concat([df, target], axis=1)
    cols = data.columns.values
    data["dummy"] = 0
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle(f"Distribution of feature values splitted by {target.name}", size=20)
    for i, col in enumerate(cols):
        if col != target.name:
            sns.violinplot(
                ax=ax[i],
                hue=target.name,
                x=col,
                y="dummy",
                orient="h",
                cut=0,
                showextrem=False,
                split=True,
                data=data,
            )
            ax[i].get_legend().remove()
            ax[i].set_xlim(
                data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
            )
            ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9, top=0.97)
    return fig
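
A minimal sketch of the expected usage, assuming the import path above and a seaborn version compatible with the split-violin workaround referenced in the docstring:

import numpy as np
import pandas as pd

from churn_pred.eda.target.plotting import distributions_in_binary_cls

rng = np.random.default_rng(0)
target = pd.Series(rng.integers(0, 2, size=300), name="churn")
df = pd.DataFrame(
    {
        "tenure": rng.integers(0, 60, size=300) - 10 * target,  # shifted per class
        "spend": rng.lognormal(size=300),
    }
)
fig = distributions_in_binary_cls(df, target, high_per_cut=0.99)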

plotting

bar_plot

bar_plot(df, ax, title)

Helper function for a unified bar plot.

Parameters:

  • df (DataFrame) –

    dataframe to plot

  • ax (Axes) –

    axes defining where to plot it

  • title (str) –

    title of the plot

Returns:

  • adjusted_plot( Axes ) –

    adjusted axes

Source code in churn_pred/eda/plotting.py
def bar_plot(df: pd.DataFrame, ax: Axes, title: str) -> Axes:
    """Helper method for unified bar plot

    Args:
        df (DataFrame): dataframe to plot
        ax (Axes): axes defining where to plot it

    Returns:
        adjusted_plot (Axes): adjusted axes
    """
    data = df.copy()
    ax.set_title(title, size=20)
    ax.tick_params(labelsize=8)
    ax.set_xlabel("sorted features", size=12)
    return data.plot.bar(ax=ax)
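
A minimal sketch of the expected usage; note that the callers in this package pass a pandas Series, which also exposes .plot.bar (assumed import path):

import matplotlib.pyplot as plt
import pandas as pd

from churn_pred.eda.plotting import bar_plot

frac = pd.Series({"feature_a": 0.5, "feature_b": 0.2})
fig, ax = plt.subplots(figsize=(10, 5))
bar_plot(frac, ax, title="Fraction of missing values in the dataset")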

general_utils

intsec

intsec(list1, list2)

Simple intersection of two lists.

Parameters:

  • list1 (list) –

    first list

  • list2 (list) –

    second list

Returns:

  • list( list ) –

    intersection of the two lists (order not preserved)

Source code in churn_pred/eda/general_utils.py
def intsec(list1: list, list2: list) -> list:
    """Simple intersection of two lists.

    Args:
        list1 (list): first list
        list2 (list): second list

    Returns:
        list (list): intersection of the two lists (order not preserved)
    """
    return list(set.intersection(set(list1), set(list2)))
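
A minimal sketch of the expected usage (assumed import path):

from churn_pred.eda.general_utils import intsec

cont_cols = ["age", "income", "tenure"]
present = intsec(["age", "tenure", "churn"], cont_cols)
print(present)  # ['age', 'tenure'] in arbitrary order (set-based)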

entropy_calc

entropy_calc(labels, base=np.e)

Computes entropy of both continuous and categorical features. Shamelessly stolen from: https://stackoverflow.com/a/45091961

Parameters:

  • labels (list, ndarray, Series) –

    list of values

  • base (float) –

    logarithm base of the entropy; defaults to np.e (nats)

Returns:

  • ent( float ) –

    entropy of the list of values

Source code in churn_pred/eda/general_utils.py
def entropy_calc(labels: list, base: float = np.e) -> float:
    """Computes entropy of both continuous and categorical features.
    Shamelessly stolen from:
    https://stackoverflow.com/a/45091961

    Args:
        labels (list, ndarray, Series): list of values
        base (float): logarithm base of the entropy; defaults to np.e (nats)

    Returns:
        ent (float): entropy of the list of values
    """
    n_labels = len(labels)
    # Empirical probability of each distinct value
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    ent = float(0)
    if n_classes > 1:
        for i in probs:
            ent -= i * math.log(i, base)
    return ent
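
A minimal sketch of the expected usage (assumed import path):

from churn_pred.eda.general_utils import entropy_calc

print(entropy_calc([0, 0, 1, 1]))          # ln(2) ≈ 0.693 nats (base=np.e)
print(entropy_calc([0, 0, 1, 1], base=2))  # 1.0 bit
print(entropy_calc(["a", "a", "a"]))       # 0.0 for a single class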