Preprocess data

entropy(df, scale='linear', plot=False)

Procedure to plot features sorted by their entropy.

Parameters:

Name   Type       Description                  Default
df     DataFrame  dataset                      required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name           Type       Description
col_entropies  DataFrame  sorted entropy of each feature
fig            Figure     plot with sorted entropy of the features

Source code in inference_model/eda/features/analysis.py
def entropy(
    df: pd.DataFrame, scale: Literal["log", "linear"] = "linear", plot: bool = False
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot features sorted by their entropy.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        col_entropies (DataFrame): sorted entropy of each feature
        fig (Figure): plot with sorted entropy of the features
    """
    data = df.copy()

    data = data.astype(str)
    col_entropies = data.apply(entropy_calc, axis=0)
    col_entropies.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            col_entropies.dropna(), ax, title="Features entropies (NA are dropped)"
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return col_entropies, fig
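
A minimal usage sketch (assuming the package is importable as inference_model, following the source location above; the toy dataframe is purely illustrative):

import pandas as pd

from inference_model.eda.features.analysis import entropy

# toy dataset: one near-constant column and one high-cardinality column
df = pd.DataFrame({"a": [1, 1, 1, 2], "b": ["x", "y", "z", "w"]})

col_entropies, fig = entropy(df, scale="linear", plot=True)
print(col_entropies)  # features sorted from highest to lowest entropy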

init_check(df, identifier=None, cat_cols=None, cont_cols=None, verbose=False)

Procedure to check:

* duplicated identifiers in the dataset
* general stats of numerical features
* general stats of categorical features

Parameters:

Name        Type       Description                                   Default
df          DataFrame  pandas dataframe                              required
identifier  str        column which identifies unique user IDs       None
cat_cols    list       categorical features in the dataset           None
cont_cols   list       numerical features in the dataset             None
verbose     bool       whether to print the results of each check    False

Returns:

Name            Type       Description
duplicated_ids  int        number of duplicated identifier values
cont_cols_desc  DataFrame  summary statistics of the numerical features
cat_cols_desc   DataFrame  summary statistics of the categorical features
Source code in inference_model/eda/features/analysis.py
def init_check(
    df: pd.DataFrame,
    identifier: Optional[str] = None,
    cat_cols: Optional[list] = None,
    cont_cols: Optional[list] = None,
    verbose: bool = False,
) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
    """Procedure to check:
    * duplicated rows in teh dataset
    * general stats of numerical features
    * general stats of categorical features

    Args:
        df (DataFrame): pandas dataframe
        identifier (str): column which identifies unique user IDs
        cat_cols (list): categorical features in the dataset
        cont_cols (list): numerical features in the dataset

    Returns:
        duplicated_ids (int):
        cont_cols_desc (pd.DataFrame):
        cat_cols_desc (pd.DataFrame):
    """
    data = df.copy()

    if identifier:
        duplicated_ids = data[identifier].duplicated().sum()
        if verbose:
            print("[CHECK] Number of duplicated ids: {}".format(duplicated_ids))
    else:
        duplicated_ids = None

    if cont_cols:
        cont_cols_f = intsec(data.columns.values, cont_cols)
        cont_cols_desc = data[cont_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Numerical columns")
            with pd.option_context("display.precision", 2):
                print(cont_cols_desc)
    else:
        cont_cols_desc = None

    if cat_cols:
        cat_cols_f = intsec(data.columns.values, cat_cols)
        cat_cols_desc = data[cat_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Categorical columns")
            with pd.option_context("display.precision", 2):
                print(cat_cols_desc)
    else:
        cat_cols_desc = None

    return duplicated_ids, cont_cols_desc, cat_cols_desc
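
A hedged example of how the check might be called; the column names are illustrative, not part of the library:

import pandas as pd

from inference_model.eda.features.analysis import init_check

# illustrative dataset with an id column, one numerical and one categorical feature
df = pd.DataFrame(
    {"user_id": ["u1", "u1", "u2"], "age": [31, 31, 45], "country": ["DE", "DE", "FR"]}
)

duplicated_ids, cont_desc, cat_desc = init_check(
    df, identifier="user_id", cat_cols=["country"], cont_cols=["age"], verbose=True
)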

missing(df, scale='linear', plot=False)

Procedure to check fraction of missing values in the dataset.

Parameters:

Name   Type       Description                  Default
df     DataFrame  pandas dataframe             required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name              Type       Description
missing_val_frac  DataFrame  sorted dataframe with fraction of missing values per feature
fig               Figure     plot with sorted fractions of missing values in each column

Source code in inference_model/eda/features/analysis.py
def missing(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to check fraction of missing values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        missing_val_frac (DataFrame): sorted dataframe with fraction of missing values
            per feature
        fig (Figure): plot with sorted fractions of missing values in each column
    """
    data = df.copy()
    missing_val_frac = data.isna().sum() / len(data)
    missing_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            missing_val_frac,
            ax,
            title="Fraction of missing values in the dataset",
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return missing_val_frac, fig
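
A short usage sketch with illustrative data; note that the zero, std and nunique helpers below follow the same (values, fig) call pattern:

import numpy as np
import pandas as pd

from inference_model.eda.features.analysis import missing

# illustrative dataframe with some NaN values
df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, np.nan, 1.0]})

missing_val_frac, fig = missing(df, plot=True)
print(missing_val_frac)  # b: 0.67, a: 0.33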

nunique(df, scale='linear', plot=False)

Procedure to plot features sorted by their number of unique values.

Parameters:

Name   Type       Description                  Default
df     DataFrame  dataset                      required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name          Type       Description
data_nunique  DataFrame  sorted number of unique values per feature
fig           Figure     plot with sorted number of unique values in features

Source code in inference_model/eda/features/analysis.py
def nunique(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot features sorted by their number of unique values.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_nunique (DataFrame): sorted number of unique values per feature
        fig (Figure): plot with sorted number of unique values in features
    """
    data = df.copy()

    data_nunique = data.nunique()
    data_nunique.sort_values(ascending=True, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_nunique.dropna(),
            ax,
            title="Features number of unique values (NA are dropped)",
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return data_nunique, fig
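
A usage sketch showing the log scale option, which helps when cardinalities span orders of magnitude (data is illustrative):

import pandas as pd

from inference_model.eda.features.analysis import nunique

# illustrative dataframe: a binary flag next to a unique identifier column
df = pd.DataFrame({"flag": [0, 1] * 500, "id": range(1000)})

data_nunique, fig = nunique(df, scale="log", plot=True)
print(data_nunique)  # flag: 2, id: 1000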

std(df, scale='linear', plot=False)

Procedure to plot features sorted by their standard deviation.

Parameters:

Name   Type       Description                  Default
df     DataFrame  dataset                      required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name      Type       Description
data_std  DataFrame  sorted standard deviation of continuous features
fig       Figure     plot with sorted standard deviation of continuous features

Source code in inference_model/eda/features/analysis.py
def std(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot features sorted by their variance.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_std (DataFrame): sorted standard deviation of continuous features
        fig (Figure): plot with sorted standard deviation of continuous features
    """
    data = df.copy()

    data_std = data.std()

    data_std.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_std.dropna(), ax, title="Features standard deviation (NA are dropped)"
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return data_std, fig

zero(df, scale='linear', plot=False)

Procedure to check fraction of zero values in the dataset.

Parameters:

Name   Type       Description                  Default
df     DataFrame  pandas dataframe             required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name           Type       Description
zero_val_frac  DataFrame  sorted dataframe with fraction of '0' values per feature
fig            Figure     plot with sorted fractions of '0' values in each column

Source code in inference_model/eda/features/analysis.py
def zero(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to check fraction of zero values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        zero_val_frac (DataFrame): sorted dataframe with fraction of '0' values per
            feature
        fig (Figure): plot with sorted fractions of '0' values in each column
    """
    data = df.copy()
    zero_val_frac = data.isin([0]).sum() / len(data)
    zero_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(zero_val_frac, ax, title="Fraction of zero values in the dataset")
        ax.set_yscale(scale)
    else:
        fig = None
    return zero_val_frac, fig

cross_correlation(df, n=10, verbose=False)

Procedure to calculate and plot cross-correlation of features in the dataset.

Parameters:

Name     Type       Description                                                 Default
df       DataFrame  pandas dataframe                                            required
n        int        number of top correlated feature pairs in verbose output    10
verbose  bool       whether to print the n most correlated feature pairs        False

Returns:

Name  Type    Description
fig   Figure  heatmap of continuous features cross correlations

Source code in inference_model/eda/features/plotting.py
def cross_correlation(
    df: pd.DataFrame,
    n: int = 10,
    verbose: bool = False,
) -> Figure:
    """Procedure to calculate and plot cross-correlation of features
    in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        n (int): number of top correlated feature pairs in verbose output
        verbose (bool): whether to print the n most correlated feature pairs

    Returns:
        fig (Figure): heatmap of continuous features cross correlations
    """
    data = df.copy()

    # corrwith uses numpy which does not like pandas Float dtypes
    corr_matrix = data.astype(float).corr()
    upper_stacked_sorted = (
        corr_matrix.abs()
        .where(np.triu(np.ones(corr_matrix.shape), k=1).astype("bool"))
        .stack()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("Cross-correlation matrix", size=20)
    sns.set(font_scale=1.0)
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cbar=True,
        square=True,
        # linewidths=0.5,
        fmt=".2f",
    )
    if verbose:
        print("Top {} absolute correlations".format(n))
        print(upper_stacked_sorted[:n])
    return fig
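
A minimal sketch with synthetic data (the import path follows the source location above and assumes the package is installed):

import numpy as np
import pandas as pd

from inference_model.eda.features.plotting import cross_correlation

# illustrative dataframe: two strongly correlated columns and one independent column
rng = np.random.default_rng(0)
a = rng.normal(size=100)
df = pd.DataFrame(
    {"a": a, "b": a * 2 + rng.normal(scale=0.1, size=100), "c": rng.normal(size=100)}
)

fig = cross_correlation(df, n=3, verbose=True)  # prints the top 3 absolute correlations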

distributions(df, low_per_cut=0, high_per_cut=1, type='box')

Procedure to plot the distribution of each feature as a box plot or a violin plot (see the seaborn violinplot workaround: https://stackoverflow.com/a/64787568/8147433). DISCLAIMER: for now the NA vals are filled with 0.

Parameters:

Name          Type       Description                                                      Default
df            DataFrame  pandas dataframe                                                 required
low_per_cut   float      lower percentile where to cut the plot for better readability   0
high_per_cut  float      higher percentile where to cut the plot for better readability  1
type          str        type of distribution plot                                        'box'

Returns:

Name  Type    Description
fig   Figure  distribution plot per each feature

Source code in inference_model/eda/features/plotting.py
def distributions(
    df: pd.DataFrame,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
    type: Literal["box", "violin"] = "box",
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        low_per_cut (float): lower percentile where to cut the plot for better
            readability
        high_per_cut (float): higher percentile where to cut the plot for better
            readability
        type (str): type of distribution plot

    Returns:
        fig (Figure): ditribution plot per each feature
    """
    data = df.copy()
    cols = data.columns.values
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle("Distribution of feature values", size=20)
    for col, i in zip(cols, range(len(cols))):
        if type == "violin":
            sns.violinplot(
                ax=ax[i],
                x=col,
                orient="h",
                # cut=0,
                # showextrem=False,
                data=data,
            )
        if type == "box":
            sns.boxplot(
                ax=ax[i],
                x=col,
                orient="h",
                data=data,
            )
        ax[i].set_xlim(
            data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
        )
        ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9)
    return fig
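
A usage sketch, assuming a seaborn version compatible with the plotting code above; the data and the output filename are illustrative:

import numpy as np
import pandas as pd

from inference_model.eda.features.plotting import distributions

# illustrative numerical dataframe; trim the x axis to the 1st and 99th percentiles
rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=200), "b": rng.exponential(size=200)})

fig = distributions(df, low_per_cut=0.01, high_per_cut=0.99, type="box")
fig.savefig("feature_distributions.png")  # hypothetical output path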

correlation(df, target, scale='linear', plot=False)

Procedure to plot the numerical features most correlated with the target column.

Parameters:

Name    Type       Description                  Default
df      DataFrame  pandas dataframe             required
target  Series     target values                required
scale   str        y scale of the plot          'linear'
plot    bool       whether to output the plot   False

Returns:

Name              Type       Description
sorted_corr_cols  DataFrame  sorted dataframe with feature and target value correlation
fig               Figure     sorted bar plot with feature and target value correlation

Source code in inference_model/eda/target/analysis.py
def correlation(
    df: pd.DataFrame,
    target: pd.Series,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot most correlated numerical features with
    target column.

    Args:
        df (pd.DataFrame): pandas dataframe
        target (pd.Series): target values
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        sorted_corr_cols (pd.DataFrame): sorted dataframe with feature and target value
            correlation
        fig (Figure): sorted bar plot with feature and target value correlation
    """
    data = df.copy()
    target_col = target.name
    correlation_df = pd.concat([data, target], axis=1).corr()
    sorted_corr_cols = correlation_df[target_col].drop(index=[target_col])
    sorted_corr_cols.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            sorted_corr_cols,
            ax,
            title=f"Sorted features by their correlation with {target_col}",
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return sorted_corr_cols, fig
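
A minimal sketch with synthetic features and a target that depends mostly on one column (all names and values are illustrative):

import numpy as np
import pandas as pd

from inference_model.eda.target.analysis import correlation

rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
target = (df["a"] * 3 + rng.normal(scale=0.5, size=100)).rename("y")

sorted_corr_cols, fig = correlation(df, target, plot=True)
print(sorted_corr_cols)  # "a" should rank above "b"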

distributions_in_binary_cls(df, target, low_per_cut=0, high_per_cut=1)

Procedure to plot the distribution of each feature split by the binary target, using the seaborn violinplot workaround: https://stackoverflow.com/a/64787568/8147433. DISCLAIMER: for now the NA vals are filled with 0.

Parameters:

Name          Type       Description                                                      Default
df            DataFrame  pandas dataframe                                                 required
target        Series     target values, i.e. binary classes                               required
low_per_cut   float      lower percentile where to cut the plot for better readability    0
high_per_cut  float      higher percentile where to cut the plot for better readability   1

Returns:

Name  Type    Description
fig   Figure  distribution plot per each feature

Source code in inference_model/eda/target/plotting.py
def distributions_in_binary_cls(
    df: pd.DataFrame,
    target: pd.Series,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        cont_cols (list): numerical features in the dataset
        target (pd.Series): target values, i.e. binary classes

    Returns:
        fig (Figure): ditribution plot per each feature
    """
    data = pd.concat([df, target], axis=1)
    cols = data.columns.values
    data["dummy"] = 0
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle(f"Distribution of feature values splitted by {target.name}", size=20)
    for col, i in zip(cols, range(len(cols))):
        if col != target.name:
            sns.violinplot(
                ax=ax[i],
                hue=target.name,
                x=col,
                y="dummy",
                orient="h",
                cut=0,
                showextrem=False,
                split=True,
                data=data,
            )
            ax[i].get_legend().remove()
            ax[i].set_xlim(
                data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
            )
            ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9, top=0.97)
    return fig
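
A usage sketch with synthetic binary classes; this assumes a seaborn version compatible with the violinplot call above, and all data is illustrative:

import numpy as np
import pandas as pd

from inference_model.eda.target.plotting import distributions_in_binary_cls

# illustrative features whose distributions differ between the two classes
rng = np.random.default_rng(0)
target = pd.Series(rng.integers(0, 2, size=200), name="label")
df = pd.DataFrame({"a": rng.normal(size=200) + target, "b": rng.normal(size=200)})

fig = distributions_in_binary_cls(df, target, low_per_cut=0.01, high_per_cut=0.99)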

prob_distrib_per_class(predicted_probs, actual, task)

Procedure to plot probability density distributions per class from LightGBM predictions.

Parameters:

Name             Type     Description           Default
predicted_probs  ndarray  predicted probs       required
actual           ndarray  ground truth classes  required
task             str      type of task          required

Returns:

Name  Type    Description
fig   Figure  probability density distributions plot per each class

Source code in inference_model/eda/target/plotting.py
def prob_distrib_per_class(
    predicted_probs: np.ndarray,
    actual: np.ndarray,
    task: Literal["binary", "multiclass"],
) -> Figure:
    """Procedure to plot probability density distributions per class from LightGBM
    predictions.

    Args:
        predicted_probs (ndarray): predicted probs
        actual (ndarray): ground truth classes
        task (str): type of task

    Returns:
        fig (Figure): probability density distributions plot per each class
    """
    if task == "binary":
        temp = pd.DataFrame({"predicted_proba": predicted_probs, "actual": actual})

        fig, ax = plt.subplots(figsize=(3, 2))
        fig.suptitle("Predicted probability density per class")
        for label, alpha, color in zip([0, 1], [1, 0.7], ["black", "red"]):
            ax.hist(
                temp[temp["actual"] == label]["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                alpha=alpha,
                color=color,
                ec="k",
            )
        ax.set_xlim([0, 1])
        ax.set_xlabel("probability")
        ax.set_ylabel("probability_density")
    elif task == "multiclass":
        temp = pd.DataFrame(
            {
                "predicted_proba": np.take_along_axis(
                    predicted_probs, np.vstack(actual), axis=1  # type: ignore
                ).flatten(),
                "actual": actual,
            }
        )

        # Dictionary of color for each label
        color_d = dict(
            zip_longest(
                temp["actual"].unique(),
                plt.rcParams["axes.prop_cycle"].by_key()["color"],
            )
        )

        n_classes = temp["actual"].nunique()
        fig, ax = plt.subplots(
            ncols=n_classes, figsize=(n_classes * 3, 2), sharex=True, sharey=True
        )
        fig.suptitle("Predicted probability density per class")
        plt.subplots_adjust(wspace=0.3, top=0.75)

        for i, (label, gp) in enumerate(temp.groupby("actual")):
            ax[i].hist(
                gp["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                color=color_d[label],
                ec="k",
            )
            ax[i].set_title(label)
            ax[i].set_xlim([0, 1])
            ax[i].set_xlabel("probability")
            ax[i].set_ylabel("probability_density")
    return fig
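
A minimal sketch for the binary case (synthetic probabilities, higher for the positive class); the multiclass case additionally expects a 2-D array of per-class probabilities:

import numpy as np

from inference_model.eda.target.plotting import prob_distrib_per_class

# illustrative binary predictions
rng = np.random.default_rng(0)
actual = rng.integers(0, 2, size=500)
predicted_probs = actual * 0.6 + rng.uniform(0, 0.4, size=500)

fig = prob_distrib_per_class(predicted_probs, actual, task="binary")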

bar_plot(df, ax, title)

Helper method for unified bar plot

Parameters:

Name   Type       Description                     Default
df     DataFrame  dataframe to plot               required
ax     Axes       axes defining where to plot it  required
title  str        title of the plot               required

Returns:

Name           Type  Description
adjusted_plot  Axes  adjusted axes

Source code in inference_model/eda/plotting.py
def bar_plot(df: pd.DataFrame, ax: Axes, title: str) -> Axes:
    """Helper method for unified bar plot

    Args:
        df (DataFrame): dataframe to plot
        ax (Axes): axes defining where to plot it

    Returns:
        adjusted_plot (Axes): adjusted axes
    """
    data = df.copy()
    ax.set_title(title, size=20)
    ax.tick_params(labelsize=8)
    ax.set_xlabel("sorted features", size=12)
    return data.plot.bar(ax=ax)

entropy_calc(labels, base=np.e)

Computes entropy of both continuous and categorical features. Shamelessly stolen from: https://stackoverflow.com/a/45091961

Parameters:

Name    Type                   Description             Default
labels  list, ndarray, Series  list of values          required
base    float                  base of the logarithm   np.e

Returns:

Name  Type   Description
ent   float  entropy of the list of values

Source code in inference_model/eda/general_utils.py
def entropy_calc(labels: list, base: float = np.e) -> float:
    """Computes entropy of both continuous and categorical features.
    Shamelessly stolen from :
    https://stackoverflow.com/a/45091961
    Args:
        labels (list, ndarray, Series): list of values
    Returns:
        ent (float): entropy of the list of values
    """
    n_labels = len(labels)
    value, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    ent = float(0)
    if n_classes > 1:
        for i in probs:
            ent -= i * math.log(i, base)
    return ent
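
A quick check of the behaviour (values chosen for illustration only):

from inference_model.eda.general_utils import entropy_calc

# a balanced two-class column has entropy ln(2) with the default natural-log base
print(entropy_calc([0, 1, 0, 1]))          # ~0.693
print(entropy_calc([0, 1, 0, 1], base=2))  # 1.0
print(entropy_calc([1, 1, 1, 1]))          # 0.0 (single class)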

intsec(list1, list2)

Simple intersection of two lists.

Parameters:

Name   Type  Description  Default
list1  list  list1        required
list2  list  list2        required

Returns:

Name  Type  Description
list  list  intersection of lists

Source code in inference_model/eda/general_utils.py
def intsec(list1: list, list2: list) -> list:
    """Simple intesection of two lists.
    Args:
        list1 (list): list1
        list2 (list): list2
    Returns:
        list (list): intersection of lists
    """
    return list(set.intersection(set(list1), set(list2)))
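
A minimal usage sketch (illustrative values); note that the intersection is set-based, so the order of the result is not guaranteed:

from inference_model.eda.general_utils import intsec

print(intsec(["a", "b", "c"], ["b", "c", "d"]))  # e.g. ['b', 'c']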