Preprocess data

entropy(df, scale='linear', plot=False)

Procedure to plot features sorted by their entropy.

Parameters:

Name   Type       Description                  Default
df     DataFrame  dataset                      required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name           Type       Description
col_entropies  DataFrame  sorted entropy of each feature
fig            Figure     plot with sorted entropy of the features

Source code in inference_model/eda/features/analysis.py
def entropy(
    df: pd.DataFrame, scale: Literal["log", "linear"] = "linear", plot: bool = False
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot features sorted by their entropy.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        col_entropies (DataFrame): sorted entropy of each feature
        fig (Figure): plot with sorted entropy of the features
    """
    data = df.copy()

    data = data.astype(str)
    col_entropies = data.apply(entropy_calc, axis=0)
    col_entropies.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            col_entropies.dropna(), ax, title="Features entropies (NA are dropped)"
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return col_entropies, fig
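
A minimal usage sketch (assuming the package is importable as inference_model, following the source location above; the toy dataframe is purely illustrative):

import pandas as pd

from inference_model.eda.features.analysis import entropy

# toy dataset: one near-constant column and one high-cardinality column
df = pd.DataFrame({"a": [1, 1, 1, 2], "b": ["x", "y", "z", "w"]})

col_entropies, fig = entropy(df, scale="linear", plot=True)
print(col_entropies)  # features sorted from highest to lowest entropy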

init_check(df, identifier=None, cat_cols=None, cont_cols=None, verbose=False)

Procedure to check:

* duplicated identifiers in the dataset
* general stats of numerical features
* general stats of categorical features

Parameters:

Name        Type       Description                                   Default
df          DataFrame  pandas dataframe                              required
identifier  str        column which identifies unique user IDs       None
cat_cols    list       categorical features in the dataset           None
cont_cols   list       numerical features in the dataset             None
verbose     bool       whether to print the results of each check    False

Returns:

Name            Type       Description
duplicated_ids  int        number of duplicated identifier values
cont_cols_desc  DataFrame  summary statistics of the numerical features
cat_cols_desc   DataFrame  summary statistics of the categorical features
Source code in inference_model/eda/features/analysis.py
def init_check(
    df: pd.DataFrame,
    identifier: Optional[str] = None,
    cat_cols: Optional[list] = None,
    cont_cols: Optional[list] = None,
    verbose: bool = False,
) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
    """Procedure to check:
    * duplicated rows in teh dataset
    * general stats of numerical features
    * general stats of categorical features

    Args:
        df (DataFrame): pandas dataframe
        identifier (str): column which identifies unique user IDs
        cat_cols (list): categorical features in the dataset
        cont_cols (list): numerical features in the dataset

    Returns:
        duplicated_ids (int):
        cont_cols_desc (pd.DataFrame):
        cat_cols_desc (pd.DataFrame):
    """
    data = df.copy()

    if identifier:
        duplicated_ids = data[identifier].duplicated().sum()
        if verbose:
            print("[CHECK] Number of duplicated ids: {}".format(duplicated_ids))
    else:
        duplicated_ids = None

    if cont_cols:
        cont_cols_f = intsec(data.columns.values, cont_cols)
        cont_cols_desc = data[cont_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Numerical columns")
            with pd.option_context("display.precision", 2):
                print(cont_cols_desc)
    else:
        cont_cols_desc = None

    if cat_cols:
        cat_cols_f = intsec(data.columns.values, cat_cols)
        cat_cols_desc = data[cat_cols_f].describe().transpose()
        if verbose:
            print("[CHECK] Categorical columns")
            with pd.option_context("display.precision", 2):
                print(cat_cols_desc)
    else:
        cat_cols_desc = None

    return duplicated_ids, cont_cols_desc, cat_cols_desc
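
A hedged example of how the check might be called; the column names are illustrative, not part of the library:

import pandas as pd

from inference_model.eda.features.analysis import init_check

# illustrative dataset with an id column, one numerical and one categorical feature
df = pd.DataFrame(
    {"user_id": ["u1", "u1", "u2"], "age": [31, 31, 45], "country": ["DE", "DE", "FR"]}
)

duplicated_ids, cont_desc, cat_desc = init_check(
    df, identifier="user_id", cat_cols=["country"], cont_cols=["age"], verbose=True
)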

missing(df, scale='linear', plot=False)

Procedure to check fraction of missing values in the dataset.

Parameters:

Name   Type       Description                  Default
df     DataFrame  pandas dataframe             required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name              Type       Description
missing_val_frac  DataFrame  sorted dataframe with fraction of missing values per feature
fig               Figure     plot with sorted fractions of missing values in each column

Source code in inference_model/eda/features/analysis.py
def missing(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to check fraction of missing values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        missing_val_frac (DataFrame): sorted dataframe with fraction of missing values
            per feature
        fig (Figure): plot with sorted fractions of missing values in each column
    """
    data = df.copy()
    missing_val_frac = data.isna().sum() / len(data)
    missing_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            missing_val_frac,
            ax,
            title="Fraction of missing values in the dataset",
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return missing_val_frac, fig
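
A short usage sketch with illustrative data; note that the zero, std and nunique helpers below follow the same (values, fig) call pattern:

import numpy as np
import pandas as pd

from inference_model.eda.features.analysis import missing

# illustrative dataframe with some NaN values
df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, np.nan, 1.0]})

missing_val_frac, fig = missing(df, plot=True)
print(missing_val_frac)  # b: 0.67, a: 0.33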

nunique(df, scale='linear', plot=False)

Procedure to plot features sorted by their number of unique values.

Parameters:

Name   Type       Description                  Default
df     DataFrame  dataset                      required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name          Type       Description
data_nunique  DataFrame  sorted number of unique values per feature
fig           Figure     plot with sorted number of unique values in features

Source code in inference_model/eda/features/analysis.py
def nunique(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot features sorted by their number of unique values.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_nunique (DataFrame): sorted number of unique values per feature
        fig (Figure): plot with sorted number of unique values in features
    """
    data = df.copy()

    data_nunique = data.nunique()
    data_nunique.sort_values(ascending=True, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_nunique.dropna(),
            ax,
            title="Features number of unique values (NA are dropped)",
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return data_nunique, fig
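
A usage sketch showing the log scale option, which helps when cardinalities span orders of magnitude (data is illustrative):

import pandas as pd

from inference_model.eda.features.analysis import nunique

# illustrative dataframe: a binary flag next to a unique identifier column
df = pd.DataFrame({"flag": [0, 1] * 500, "id": range(1000)})

data_nunique, fig = nunique(df, scale="log", plot=True)
print(data_nunique)  # flag: 2, id: 1000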

std(df, scale='linear', plot=False)

Procedure to plot features sorted by their standard deviation.

Parameters:

Name   Type       Description                  Default
df     DataFrame  dataset                      required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name      Type       Description
data_std  DataFrame  sorted standard deviation of continuous features
fig       Figure     plot with sorted standard deviation of continuous features

Source code in inference_model/eda/features/analysis.py
def std(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot features sorted by their variance.

    Args:
        df (DataFrame): dataset
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        data_std (DataFrame): sorted standard deviation of continuous features
        fig (Figure): plot with sorted standard deviation of continuous features
    """
    data = df.copy()

    data_std = data.std()

    data_std.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            data_std.dropna(), ax, title="Features standard deviation (NA are dropped)"
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return data_std, fig

zero(df, scale='linear', plot=False)

Procedure to check fraction of zero values in the dataset.

Parameters:

Name   Type       Description                  Default
df     DataFrame  pandas dataframe             required
scale  str        y scale of the plot          'linear'
plot   bool       whether to output the plot   False

Returns:

Name           Type       Description
zero_val_frac  DataFrame  sorted dataframe with fraction of '0' values per feature
fig            Figure     plot with sorted fractions of '0' values in each column

Source code in inference_model/eda/features/analysis.py
def zero(
    df: pd.DataFrame,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to check fraction of zero values in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        zero_val_frac (DataFrame): sorted dataframe with fraction of '0' values per
            feature
        fig (Figure): plot with sorted fractions of '0' values in each column
    """
    data = df.copy()
    zero_val_frac = data.isin([0]).sum() / len(data)
    zero_val_frac.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(zero_val_frac, ax, title="Fraction of zero values in the dataset")
        ax.set_yscale(scale)
    else:
        fig = None
    return zero_val_frac, fig

cross_correlation(df, n=10, verbose=False)

Procedure to calculate and plot cross-correlation of features in the dataset.

Parameters:

Name     Type       Description                                                 Default
df       DataFrame  pandas dataframe                                            required
n        int        number of top correlated feature pairs in verbose output    10
verbose  bool       whether to print the n most correlated feature pairs        False

Returns:

Name  Type    Description
fig   Figure  heatmap of continuous features cross correlations

Source code in inference_model/eda/features/plotting.py
def cross_correlation(
    df: pd.DataFrame,
    n: int = 10,
    verbose: bool = False,
) -> Figure:
    """Procedure to calculate and plot cross-correlation of features
    in the dataset.

    Args:
        df (DataFrame): pandas dataframe
        n (int): number of top correlated feature pairs in verbose output
        verbose (bool): whether to print the n most correlated feature pairs

    Returns:
        fig (Figure): heatmap of continuous features cross correlations
    """
    data = df.copy()

    # corrwith uses numpy which does not like pandas Float dtypes
    corr_matrix = data.astype(float).corr()
    upper_stacked_sorted = (
        corr_matrix.abs()
        .where(np.triu(np.ones(corr_matrix.shape), k=1).astype("bool"))
        .stack()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("Cross-correlation matrix", size=20)
    sns.set(font_scale=1.0)
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cbar=True,
        square=True,
        # linewidths=0.5,
        fmt=".2f",
    )
    if verbose:
        print("Top {} absolute correlations".format(n))
        print(upper_stacked_sorted[:n])
    return fig
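
A minimal sketch with synthetic data (the import path follows the source location above and assumes the package is installed):

import numpy as np
import pandas as pd

from inference_model.eda.features.plotting import cross_correlation

# illustrative dataframe: two strongly correlated columns and one independent column
rng = np.random.default_rng(0)
a = rng.normal(size=100)
df = pd.DataFrame(
    {"a": a, "b": a * 2 + rng.normal(scale=0.1, size=100), "c": rng.normal(size=100)}
)

fig = cross_correlation(df, n=3, verbose=True)  # prints the top 3 absolute correlations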

distributions(df, low_per_cut=0, high_per_cut=1, type='box')

Procedure to plot the distribution of each feature as a box plot or a violin plot (see the seaborn violinplot workaround: https://stackoverflow.com/a/64787568/8147433). DISCLAIMER: for now the NA vals are filled with 0.

Parameters:

Name          Type       Description                                                      Default
df            DataFrame  pandas dataframe                                                 required
low_per_cut   float      lower percentile where to cut the plot for better readability   0
high_per_cut  float      higher percentile where to cut the plot for better readability  1
type          str        type of distribution plot                                        'box'

Returns:

Name  Type    Description
fig   Figure  distribution plot per each feature

Source code in inference_model/eda/features/plotting.py
def distributions(
    df: pd.DataFrame,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
    type: Literal["box", "violin"] = "box",
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        low_per_cut (float): lower percentile where to cut the plot for better
            readability
        high_per_cut (float): higher percentile where to cut the plot for better
            readability
        type (str): type of distribution plot

    Returns:
        fig (Figure): ditribution plot per each feature
    """
    data = df.copy()
    cols = data.columns.values
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle("Distribution of feature values", size=20)
    for col, i in zip(cols, range(len(cols))):
        if type == "violin":
            sns.violinplot(
                ax=ax[i],
                x=col,
                orient="h",
                # cut=0,
                # showextrem=False,
                data=data,
            )
        if type == "box":
            sns.boxplot(
                ax=ax[i],
                x=col,
                orient="h",
                data=data,
            )
        ax[i].set_xlim(
            data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
        )
        ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9)
    return fig
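
A usage sketch, assuming a seaborn version compatible with the plotting code above; the data and the output filename are illustrative:

import numpy as np
import pandas as pd

from inference_model.eda.features.plotting import distributions

# illustrative numerical dataframe; trim the x axis to the 1st and 99th percentiles
rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=200), "b": rng.exponential(size=200)})

fig = distributions(df, low_per_cut=0.01, high_per_cut=0.99, type="box")
fig.savefig("feature_distributions.png")  # hypothetical output path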

correlation(df, target, scale='linear', plot=False)

Procedure to plot the numerical features most correlated with the target column.

Parameters:

Name    Type       Description                  Default
df      DataFrame  pandas dataframe             required
target  Series     target values                required
scale   str        y scale of the plot          'linear'
plot    bool       whether to output the plot   False

Returns:

Name              Type       Description
sorted_corr_cols  DataFrame  sorted dataframe with feature and target value correlation
fig               Figure     sorted bar plot with feature and target value correlation

Source code in inference_model/eda/target/analysis.py
def correlation(
    df: pd.DataFrame,
    target: pd.Series,
    scale: Literal["log", "linear"] = "linear",
    plot: bool = False,
) -> Tuple[pd.DataFrame, Figure]:
    """Procedure to plot most correlated numerical features with
    target column.

    Args:
        df (pd.DataFrame): pandas dataframe
        target (pd.Series): target values
        scale (str): y scale of the plot
        plot (bool): whether to output the plot

    Returns:
        sorted_corr_cols (pd.DataFrame): sorted dataframe with feature and target value
            correlation
        fig (Figure): sorted bar plot with feature and target value correlation
    """
    data = df.copy()
    target_col = target.name
    correlation_df = pd.concat([data, target], axis=1).corr()
    sorted_corr_cols = correlation_df[target_col].drop(index=[target_col])
    sorted_corr_cols.sort_values(ascending=False, inplace=True)

    if plot:
        fig, ax = plt.subplots(figsize=(10, 5))
        bar_plot(
            sorted_corr_cols,
            ax,
            title=f"Sorted features by their correlation with {target_col}",
        )
        ax.set_yscale(scale)
    else:
        fig = None
    return sorted_corr_cols, fig
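
A minimal sketch with synthetic features and a target that depends mostly on one column (all names and values are illustrative):

import numpy as np
import pandas as pd

from inference_model.eda.target.analysis import correlation

rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
target = (df["a"] * 3 + rng.normal(scale=0.5, size=100)).rename("y")

sorted_corr_cols, fig = correlation(df, target, plot=True)
print(sorted_corr_cols)  # "a" should rank above "b"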

distributions_in_binary_cls(df, target, low_per_cut=0, high_per_cut=1)

Procedure to plot the distribution of each feature split by the binary target, using the seaborn violinplot workaround: https://stackoverflow.com/a/64787568/8147433. DISCLAIMER: for now the NA vals are filled with 0.

Parameters:

Name          Type       Description                                                      Default
df            DataFrame  pandas dataframe                                                 required
target        Series     target values, i.e. binary classes                               required
low_per_cut   float      lower percentile where to cut the plot for better readability    0
high_per_cut  float      higher percentile where to cut the plot for better readability   1

Returns:

Name  Type    Description
fig   Figure  distribution plot per each feature

Source code in inference_model/eda/target/plotting.py
def distributions_in_binary_cls(
    df: pd.DataFrame,
    target: pd.Series,
    low_per_cut: float = 0,
    high_per_cut: float = 1,
) -> Figure:
    """Procedure to plot distributions of the features splitted
    by column split_col using workaround for violinplots in seaborn:
    * https://stackoverflow.com/a/64787568/8147433
    DISCALIMER: for now the cont_cols NA vals are filled with 0

    Args:
        df (DataFrame): pandas dataframe
        cont_cols (list): numerical features in the dataset
        target (pd.Series): target values, i.e. binary classes

    Returns:
        fig (Figure): ditribution plot per each feature
    """
    data = pd.concat([df, target], axis=1)
    cols = data.columns.values
    data["dummy"] = 0
    data = data.fillna(0).astype(float)
    fig, ax = plt.subplots(len(cols), 1, figsize=(16, len(cols) * 2))
    fig.suptitle(f"Distribution of feature values splitted by {target.name}", size=20)
    for col, i in zip(cols, range(len(cols))):
        if col != target.name:
            sns.violinplot(
                ax=ax[i],
                hue=target.name,
                x=col,
                y="dummy",
                orient="h",
                cut=0,
                showextrem=False,
                split=True,
                data=data,
            )
            ax[i].get_legend().remove()
            ax[i].set_xlim(
                data[col].quantile(low_per_cut), data[col].quantile(high_per_cut)
            )
            ax[i].get_yaxis().set_visible(False)
    plt.subplots_adjust(hspace=0.9, top=0.97)
    return fig
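
A usage sketch with synthetic binary classes; this assumes a seaborn version compatible with the violinplot call above, and all data is illustrative:

import numpy as np
import pandas as pd

from inference_model.eda.target.plotting import distributions_in_binary_cls

# illustrative features whose distributions differ between the two classes
rng = np.random.default_rng(0)
target = pd.Series(rng.integers(0, 2, size=200), name="label")
df = pd.DataFrame({"a": rng.normal(size=200) + target, "b": rng.normal(size=200)})

fig = distributions_in_binary_cls(df, target, low_per_cut=0.01, high_per_cut=0.99)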

prob_distrib_per_class(predicted_probs, actual, task)

Procedure to plot probability density distributions per class from LightGBM predictions.

Parameters:

Name             Type     Description           Default
predicted_probs  ndarray  predicted probs       required
actual           ndarray  ground truth classes  required
task             str      type of task          required

Returns:

Name  Type    Description
fig   Figure  probability density distributions plot per each class

Source code in inference_model/eda/target/plotting.py
def prob_distrib_per_class(
    predicted_probs: np.ndarray,
    actual: np.ndarray,
    task: Literal["binary", "multiclass"],
) -> Figure:
    """Procedure to plot probability density distributions per class from LightGBM
    predictions.

    Args:
        predicted_probs (ndarray): predicted probs
        actual (ndarray): ground truth classes
        task (str): type of task

    Returns:
        fig (Figure): probability density distributions plot per each class
    """
    if task == "binary":
        temp = pd.DataFrame({"predicted_proba": predicted_probs, "actual": actual})

        fig, ax = plt.subplots(figsize=(3, 2))
        fig.suptitle("Predicted probability density per class")
        for label, alpha, color in zip([0, 1], [1, 0.7], ["black", "red"]):
            ax.hist(
                temp[temp["actual"] == label]["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                alpha=alpha,
                color=color,
                ec="k",
            )
        ax.set_xlim([0, 1])
        ax.set_xlabel("probability")
        ax.set_ylabel("probability_density")
    elif task == "multiclass":
        temp = pd.DataFrame(
            {
                "predicted_proba": np.take_along_axis(
                    predicted_probs, np.vstack(actual), axis=1  # type: ignore
                ).flatten(),
                "actual": actual,
            }
        )

        # Dictionary of color for each label
        color_d = dict(
            zip_longest(
                temp["actual"].unique(),
                plt.rcParams["axes.prop_cycle"].by_key()["color"],
            )
        )

        n_classes = temp["actual"].nunique()
        fig, ax = plt.subplots(
            ncols=n_classes, figsize=(n_classes * 3, 2), sharex=True, sharey=True
        )
        fig.suptitle("Predicted probability density per class")
        plt.subplots_adjust(wspace=0.3, top=0.75)

        for i, (label, gp) in enumerate(temp.groupby("actual")):
            ax[i].hist(
                gp["predicted_proba"],
                density=True,
                bins=np.linspace(0, 1, 10),
                color=color_d[label],
                ec="k",
            )
            ax[i].set_title(label)
            ax[i].set_xlim([0, 1])
            ax[i].set_xlabel("probability")
            ax[i].set_ylabel("probability_density")
    return fig
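
A minimal sketch for the binary case (synthetic probabilities, higher for the positive class); the multiclass case additionally expects a 2-D array of per-class probabilities:

import numpy as np

from inference_model.eda.target.plotting import prob_distrib_per_class

# illustrative binary predictions
rng = np.random.default_rng(0)
actual = rng.integers(0, 2, size=500)
predicted_probs = actual * 0.6 + rng.uniform(0, 0.4, size=500)

fig = prob_distrib_per_class(predicted_probs, actual, task="binary")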

bar_plot(df, ax, title)

Helper method for unified bar plot

Parameters:

Name   Type       Description                     Default
df     DataFrame  dataframe to plot               required
ax     Axes       axes defining where to plot it  required
title  str        title of the plot               required

Returns:

Name           Type  Description
adjusted_plot  Axes  adjusted axes

Source code in inference_model/eda/plotting.py
def bar_plot(df: pd.DataFrame, ax: Axes, title: str) -> Axes:
    """Helper method for unified bar plot

    Args:
        df (DataFrame): dataframe to plot
        ax (Axes): axes defining where to plot it

    Returns:
        adjusted_plot (Axes): adjusted axes
    """
    data = df.copy()
    ax.set_title(title, size=20)
    ax.tick_params(labelsize=8)
    ax.set_xlabel("sorted features", size=12)
    return data.plot.bar(ax=ax)

entropy_calc(labels, base=np.e)

Computes entropy of both continuous and categorical features. Shamelessly stolen from: https://stackoverflow.com/a/45091961

Parameters:

Name    Type                   Description             Default
labels  list, ndarray, Series  list of values          required
base    float                  base of the logarithm   np.e

Returns:

Name  Type   Description
ent   float  entropy of the list of values

Source code in inference_model/eda/general_utils.py
def entropy_calc(labels: list, base: float = np.e) -> float:
    """Computes entropy of both continuous and categorical features.
    Shamelessly stolen from :
    https://stackoverflow.com/a/45091961
    Args:
        labels (list, ndarray, Series): list of values
    Returns:
        ent (float): entropy of the list of values
    """
    n_labels = len(labels)
    value, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    ent = float(0)
    if n_classes > 1:
        for i in probs:
            ent -= i * math.log(i, base)
    return ent
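
A quick check of the behaviour (values chosen for illustration only):

from inference_model.eda.general_utils import entropy_calc

# a balanced two-class column has entropy ln(2) with the default natural-log base
print(entropy_calc([0, 1, 0, 1]))          # ~0.693
print(entropy_calc([0, 1, 0, 1], base=2))  # 1.0
print(entropy_calc([1, 1, 1, 1]))          # 0.0 (single class)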

intsec(list1, list2)

Simple intersection of two lists.

Parameters:

Name   Type  Description  Default
list1  list  list1        required
list2  list  list2        required

Returns:

Name  Type  Description
list  list  intersection of lists

Source code in inference_model/eda/general_utils.py
def intsec(list1: list, list2: list) -> list:
    """Simple intesection of two lists.
    Args:
        list1 (list): list1
        list2 (list): list2
    Returns:
        list (list): intersection of lists
    """
    return list(set.intersection(set(list1), set(list2)))
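
A minimal usage sketch (illustrative values); note that the intersection is set-based, so the order of the result is not guaranteed:

from inference_model.eda.general_utils import intsec

print(intsec(["a", "b", "c"], ["b", "c", "d"]))  # e.g. ['b', 'c']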