
Preprocess data

LabelEncoder

Bases: object

Label encode categorical values for multiple columns at once

ℹ️ NOTE: Shamelessly copied from https://github.com/jrzaurin/pytorch-widedeep

ℹ️ NOTE: LabelEncoder reserves 0 for unseen new categories. This is convenient when defining the embedding layers, since we can just set padding idx to 0.
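For example (a minimal sketch, assuming the module is importable from the source path shown below): categories seen during fit are mapped to 1..N, while a category first seen at transform time maps to 0:

>>> import pandas as pd
>>> from inference_model.preprocessing.label_encoder import LabelEncoder
>>> encoder = LabelEncoder(['col'])
>>> _ = encoder.fit(pd.DataFrame({'col': ['a', 'b']}))
>>> encoder.transform(pd.DataFrame({'col': ['a', 'c']}))['col'].tolist()
[1, 0]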

Parameters:

columns_to_encode (list, Optional, default = None): List of strings containing the names of the columns to encode. If None, all columns of type object in the dataframe will be label encoded.

Attributes:

encoding_dict (Dict): Dictionary containing the encoding mappings, e.g.:
    {'colname1': {'cat1': 1, 'cat2': 2, ...}, 'colname2': {'cat1': 1, 'cat2': 2, ...}, ...}

inverse_encoding_dict (Dict): Dictionary containing the inverse encoding mappings, e.g.:
    {'colname1': {1: 'cat1', 2: 'cat2', ...}, 'colname2': {1: 'cat1', 2: 'cat2', ...}, ...}

Source code in inference_model/preprocessing/label_encoder.py
class LabelEncoder(object):
    r"""Label Encode categorical values for multiple columns at once

    :information_source: **NOTE**:
    Shamelessly copied from https://github.com/jrzaurin/pytorch-widedeep

    :information_source: **NOTE**:
    LabelEncoder reserves 0 for `unseen` new categories. This is convenient
    when defining the embedding layers, since we can just set padding idx to 0.

    Parameters:
        columns_to_encode (list, Optional, default = None): List of strings containing
            the names of the columns to encode. If `None` all columns of type `object`
            in the dataframe will be label encoded.

    Attributes:
        encoding_dict (Dict): Dictionary containing the encoding mappings in the format,
            e.g. : <br/> `{'colname1': {'cat1': 1, 'cat2': 2, ...}, 'colname2': {'cat1': 1, 'cat2': 2, ...}, ...}`  # noqa
        inverse_encoding_dict (Dict): Dictionary containing the inverse encoding mappings
            in the format, e.g. : <br/> `{'colname1': {1: 'cat1', 2: 'cat2', ...}, 'colname2': {1: 'cat1', 2: 'cat2', ...}, ...}`  # noqa
    """

    def __init__(
        self,
        columns_to_encode: Optional[List[str]] = None,
    ):
        self.columns_to_encode = columns_to_encode

    def fit(self, df: pd.DataFrame) -> "LabelEncoder":
        """Creates encoding attributes

        Returns:
            LabelEncoder: `LabelEncoder` fitted object
        """

        df_inp = df.copy()

        if self.columns_to_encode is None:
            self.columns_to_encode = list(
                df_inp.select_dtypes(include=["object"]).columns
            )
        else:
            # sanity check to make sure all categorical columns are in an adequate
            # format
            for col in self.columns_to_encode:
                df_inp[col] = df_inp[col].astype("O")

        unique_column_vals = dict()
        for c in self.columns_to_encode:
            unique_column_vals[c] = df_inp[c].unique()

        self.encoding_dict = dict()

        # leave 0 for padding/"unseen" categories; seen values start at 1
        for k, v in unique_column_vals.items():
            self.encoding_dict[k] = {o: i + 1 for i, o in enumerate(v)}

        self.inverse_encoding_dict = dict()
        for c in self.encoding_dict:
            self.inverse_encoding_dict[c] = {
                v: k for k, v in self.encoding_dict[c].items()
            }
            self.inverse_encoding_dict[c][0] = "unseen"

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Label Encoded the categories in `columns_to_encode`

        Returns:
            pd.DataFrame: label-encoded dataframe
        """
        try:
            self.encoding_dict
        except AttributeError:
            raise NotFittedError(
                "This LabelEncoder instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this LabelEncoder."
            )

        df_inp = df.copy()
        # sanity check to make sure all categorical columns are in an adequate
        # format
        for col in self.columns_to_encode:  # type: ignore
            df_inp[col] = df_inp[col].astype("O")

        for k, v in self.encoding_dict.items():
            df_inp[k] = df_inp[k].apply(lambda x: v.get(x, 0))

        return df_inp

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Combines `fit` and `transform`

        Returns:
            pd.DataFrame: label-encoded dataframe

        Examples:
            >>> import pandas as pd
            >>> from data_preparation.label_encoder import LabelEncoder
            >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
            >>> columns_to_encode = ['col2']
            >>> encoder = LabelEncoder(columns_to_encode)
            >>> encoder.fit_transform(df)
               col1  col2
            0     1     1
            1     2     2
            2     3     3
            >>> encoder.encoding_dict
            {'col2': {'me': 1, 'you': 2, 'him': 3}}
        """
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Returns the original categories

        Returns:
            pd.DataFrame: dataframe with the original categories restored

        Examples:
            >>> import pandas as pd
            >>> from data_preparation.label_encoder import LabelEncoder
            >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
            >>> columns_to_encode = ['col2']
            >>> encoder = LabelEncoder(columns_to_encode)
            >>> df_enc = encoder.fit_transform(df)
            >>> encoder.inverse_transform(df_enc)
               col1 col2
            0     1   me
            1     2  you
            2     3  him
        """
        df = df.copy()  # avoid mutating the caller's dataframe
        for k, v in self.inverse_encoding_dict.items():
            df[k] = df[k].apply(lambda x: v[x])
        return df

fit(df)

Creates encoding attributes

Returns:

LabelEncoder: the fitted LabelEncoder object

Source code in inference_model/preprocessing/label_encoder.py
def fit(self, df: pd.DataFrame) -> "LabelEncoder":
    """Creates encoding attributes

    Returns:
        LabelEncoder: `LabelEncoder` fitted object
    """

    df_inp = df.copy()

    if self.columns_to_encode is None:
        self.columns_to_encode = list(
            df_inp.select_dtypes(include=["object"]).columns
        )
    else:
        # sanity check to make sure all categorical columns are in an adequate
        # format
        for col in self.columns_to_encode:
            df_inp[col] = df_inp[col].astype("O")

    unique_column_vals = dict()
    for c in self.columns_to_encode:
        unique_column_vals[c] = df_inp[c].unique()

    self.encoding_dict = dict()

    # leave 0 for padding/"unseen" categories; seen values start at 1
    for k, v in unique_column_vals.items():
        self.encoding_dict[k] = {o: i + 1 for i, o in enumerate(v)}

    self.inverse_encoding_dict = dict()
    for c in self.encoding_dict:
        self.inverse_encoding_dict[c] = {
            v: k for k, v in self.encoding_dict[c].items()
        }
        self.inverse_encoding_dict[c][0] = "unseen"

    return self

fit_transform(df)

Combines fit and transform

Returns:

pd.DataFrame: label-encoded dataframe

Examples:

>>> import pandas as pd
>>> from data_preparation.label_encoder import LabelEncoder
>>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
>>> columns_to_encode = ['col2']
>>> encoder = LabelEncoder(columns_to_encode)
>>> encoder.fit_transform(df)
   col1  col2
0     1     1
1     2     2
2     3     3
>>> encoder.encoding_dict
{'col2': {'me': 1, 'you': 2, 'him': 3}}
Source code in inference_model/preprocessing/label_encoder.py
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Combines `fit` and `transform`

    Returns:
        pd.DataFrame: label-encoded dataframe

    Examples:
        >>> import pandas as pd
        >>> from data_preparation.label_encoder import LabelEncoder
        >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
        >>> columns_to_encode = ['col2']
        >>> encoder = LabelEncoder(columns_to_encode)
        >>> encoder.fit_transform(df)
           col1  col2
        0     1     1
        1     2     2
        2     3     3
        >>> encoder.encoding_dict
        {'col2': {'me': 1, 'you': 2, 'him': 3}}
    """
    return self.fit(df).transform(df)

inverse_transform(df)

Returns the original categories

Returns:

pd.DataFrame: dataframe with the original categories restored

Examples:

>>> import pandas as pd
>>> from data_preparation.label_encoder import LabelEncoder
>>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
>>> columns_to_encode = ['col2']
>>> encoder = LabelEncoder(columns_to_encode)
>>> df_enc = encoder.fit_transform(df)
>>> encoder.inverse_transform(df_enc)
   col1 col2
0     1   me
1     2  you
2     3  him
Source code in inference_model/preprocessing/label_encoder.py
def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Returns the original categories

    Returns:
        pd.DataFrame: dataframe with the original categories restored

    Examples:
        >>> import pandas as pd
        >>> from data_preparation.label_encoder import LabelEncoder
        >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
        >>> columns_to_encode = ['col2']
        >>> encoder = LabelEncoder(columns_to_encode)
        >>> df_enc = encoder.fit_transform(df)
        >>> encoder.inverse_transform(df_enc)
           col1 col2
        0     1   me
        1     2  you
        2     3  him
    """
    df = df.copy()  # avoid mutating the caller's dataframe
    for k, v in self.inverse_encoding_dict.items():
        df[k] = df[k].apply(lambda x: v[x])
    return df

transform(df)

Label encodes the categories in columns_to_encode

Returns:

pd.DataFrame: label-encoded dataframe

Source code in inference_model/preprocessing/label_encoder.py
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Label Encoded the categories in `columns_to_encode`

    Returns:
        pd.DataFrame: label-encoded dataframe
    """
    try:
        self.encoding_dict
    except AttributeError:
        raise NotFittedError(
            "This LabelEncoder instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before using this LabelEncoder."
        )

    df_inp = df.copy()
    # sanity check to make sure all categorical columns are in an adequate
    # format
    for col in self.columns_to_encode:  # type: ignore
        df_inp[col] = df_inp[col].astype("O")

    for k, v in self.encoding_dict.items():
        df_inp[k] = df_inp[k].apply(lambda x: v.get(x, 0))

    return df_inp

drop_constant_cols(df, verbose=False)

Returns dataframe without constant columns, i.e. those with just 1 unique value for all rows.

Source code in inference_model/preprocessing/preprocess_data.py
def drop_constant_cols(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Returns dataframe without constant columns, i.e. those with just 1 unique
    value for all rows."""
    nunique_per_col = df.apply(lambda x: x.nunique(), axis=0)
    const_cols = nunique_per_col[nunique_per_col == 1].index.values
    df = df.drop(const_cols, axis=1)
    if verbose:
        print(
            """
            Dropped {} constant columns.
            Affected columns: {}
            """.format(
                len(const_cols), const_cols
            )
        )
    return df
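
A quick doctest-style sketch (column names are illustrative; the import assumes the module is importable from the source path shown above):

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_constant_cols
>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0]})
>>> drop_constant_cols(df).columns.tolist()
['a']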

drop_high_nan_cols(df, threshold=0.8, verbose=False)

Returns dataframe without columns that have ratio of missingness above threshold.

Parameters:

df (pd.DataFrame, required): input dataframe
threshold (float, default = 0.8): ratio of missingness applied per column
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def drop_high_nan_cols(
    df: pd.DataFrame, threshold: float = 0.8, verbose: bool = False
) -> pd.DataFrame:
    """Returns dataframe without columns that have ratio of missingness above threshold.

    Args:
        df (pd.DataFrame): input dataframe
        threshold (float = 0.8): ratio of missingness applied per column
        verbose (bool): whether the output should be verbose
    """
    n_rows = df.shape[0]
    nan_fraction_per_col = df.apply(lambda x: x.isna().sum() / n_rows, axis=0)
    high_nan_percentage_cols = nan_fraction_per_col[
        nan_fraction_per_col > threshold
    ].index.values
    df = df.drop(high_nan_percentage_cols, axis=1)
    if verbose:
        print(
            """
            Dropped {} columns with fraction of NaN above threshold = {}.
            Affected columns: {}
            """.format(
                len(high_nan_percentage_cols), threshold, high_nan_percentage_cols
            )
        )
    return df
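
For instance, with a 0.5 threshold a column that is 75% NaN is dropped (a minimal sketch; column names are illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_high_nan_cols
>>> df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [np.nan, np.nan, np.nan, 4]})
>>> drop_high_nan_cols(df, threshold=0.5).columns.tolist()
['a']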

drop_high_uq_cat_cols(df, cat_cols, uq_val_count, verbose=False)

Returns dataframe without categorical columns that have too many unique values

Parameters:

df (pd.DataFrame, required): input dataframe
cat_cols (list, required): list of categorical columns
uq_val_count (int, required): maximum number of unique values allowed; columns with more unique values are dropped
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def drop_high_uq_cat_cols(
    df: pd.DataFrame, cat_cols: list, uq_val_count: int, verbose: bool = False
) -> pd.DataFrame:
    """Returns dataframe without categorical columns that have too many unique values

    Args:
        df (pd.DataFrame): input dataframe
        cat_cols (list): list of categorical columns
        uq_val_count (int): maximum number of unique values allowed; columns
            with more unique values are dropped
        verbose (bool): whether the output should be verbose
    """
    nunique_per_col = df[cat_cols].apply(lambda x: x.nunique(), axis=0)
    high_uq_value_cat_cols = nunique_per_col[
        nunique_per_col > uq_val_count
    ].index.values
    df = df.drop(high_uq_value_cat_cols, axis=1)
    if verbose:
        print(
            """
            Dropped {} high unique value columns.
            Affected columns: {}
            """.format(
                len(high_uq_value_cat_cols), high_uq_value_cat_cols
            )
        )
    return df
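
A minimal sketch (column names are illustrative): with uq_val_count=3, an ID-like column with 4 unique values is dropped while a low-cardinality column is kept:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_high_uq_cat_cols
>>> df = pd.DataFrame({'user': ['u1', 'u2', 'u3', 'u4'], 'color': ['r', 'g', 'r', 'g']})
>>> drop_high_uq_cat_cols(df, cat_cols=['user', 'color'], uq_val_count=3).columns.tolist()
['color']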

drop_highly_correlated_columns(df, cont_cols, crosscorr_val=0.95, verbose=False)

Returns dataframe without highly correlated columns; a column is dropped when its absolute pairwise correlation with another column exceeds crosscorr_val.

Parameters:

df (pd.DataFrame, required): input dataframe
cont_cols (list, required): list of columns to evaluate correlation for
crosscorr_val (float, default = 0.95): threshold value of correlation
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def drop_highly_correlated_columns(
    df: pd.DataFrame,
    cont_cols: list,
    crosscorr_val: float = 0.95,
    verbose: bool = False,
) -> pd.DataFrame:
    """Returns dataframe without highly correlated columns, cross correlation is
    evaluated with crosscorr_val.

    Args:
        df (pd.DataFrame): input dataframe
        cont_cols (list): list of columns to evaluate correlation for
        crosscorr_val (float = 0.95): threshold value of correlation
        verbose (bool): whether the output should be verbose
    """
    corr_matrix = df[cont_cols].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype("bool"))
    upper_above = upper.apply(lambda x: any(x > crosscorr_val), axis=1)
    to_drop = upper_above[upper_above].index.values
    df = df.drop(to_drop, axis=1)
    if verbose:
        print(
            """
            Dropped {} highly correlated columns.
            Affected columns: {}
            """.format(
                len(to_drop), to_drop
            )
        )
    return df
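
A minimal sketch (values are illustrative); note that for a perfectly correlated pair this implementation drops the column that appears first:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_highly_correlated_columns
>>> df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0],
...                    'y': [2.0, 4.0, 6.0, 8.0],
...                    'z': [4.0, 1.0, 3.0, 2.0]})
>>> drop_highly_correlated_columns(df, cont_cols=['x', 'y', 'z']).columns.tolist()
['y', 'z']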

most_frequent_in_list_col(dfs)

Returns pd.Series with the most frequent values in the lists.

Parameters:

dfs (pd.Series, required): input pandas series containing string list values
Source code in inference_model/preprocessing/preprocess_data.py
def most_frequent_in_list_col(dfs: pd.Series):
    """Returns pd.Series with the most frequent values in the lists.

    Args:
        dfs (pd.Series): input pandas series containing string list values
    """
    dfsc = dfs.copy()
    dfsc = dfsc.apply(
        lambda string: _most_frequent(
            list(string.replace("[", "").replace("]", "").split(", "))
        )
        if isinstance(string, str)
        else string
    )
    return dfsc
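
A hedged sketch, assuming the private helper _most_frequent (not shown here) returns the most common item of the parsed list:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import most_frequent_in_list_col
>>> s = pd.Series(["[a, a, b]", "[c]"])
>>> most_frequent_in_list_col(s).tolist()
['a', 'c']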

nan_with_number_imputer(df, columns, fill_number=-1.0, verbose=False)

Fills NaNs with a surrogate float; -1.0 is the default and can be customized.

Parameters:

df (pd.DataFrame, required): input dataframe
columns (List[str], required): list of columns that will be filled
fill_number (float, default = -1.0): number used to replace NaNs
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def nan_with_number_imputer(
    df: pd.DataFrame,
    columns: List[str],
    fill_number: float = -1.0,
    verbose: bool = False,
) -> pd.DataFrame:
    """Fills NAs with surrogate float, -1 is default value used, it can be customized.

    Args:
        df (pd.DataFrame): input dataframe
        columns (List[str]): list of columns that will be filled
        fill_number (float = -1): number used to replace NAs. Defaults to
        verbose (bool): whether the output should be verbose
    """
    dfc = df.copy()
    if verbose:
        sum_nan_vals = df[columns].isna().sum()
        nans_cols = []
        for c in columns:
            nans_cols.append((c, df[c].isna().sum()))
    dfc = df.copy()
    dfc[columns] = dfc[columns].apply(lambda x: x.astype(float).fillna(fill_number))
    if verbose:
        print(
            """
            Imputed {} NaN values with {}.
            Affected columns (col, num_NaNs): {}
            """.format(
                sum_nan_vals, fill_number, nans_cols
            )
        )
    return dfc
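
A minimal sketch using the default surrogate of -1.0 (column name is illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import nan_with_number_imputer
>>> df = pd.DataFrame({'a': [1.0, np.nan]})
>>> nan_with_number_imputer(df, ['a'])['a'].tolist()
[1.0, -1.0]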

nan_with_unknown_imputer(df, columns, fill_token='unknown', verbose=False)

Fills NaNs with a surrogate string; 'unknown' is the default and can be customized.

Parameters:

df (pd.DataFrame, required): input dataframe
columns (List[str], required): list of columns that will be filled
fill_token (str, default = "unknown"): string used to replace NaNs
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def nan_with_unknown_imputer(
    df: pd.DataFrame,
    columns: List[str],
    fill_token: str = "unknown",
    verbose: bool = False,
) -> pd.DataFrame:
    """Fills NAs with surrogate string, 'unknown' is default value used, it can be customized.

    Args:
        df (pd.DataFrame): input dataframe
        columns (List[str]): ist of columns that will be filled
        fillna_token (str = "unknown"): string used to replace NAs
        verbose (bool): whether the output should be verbose
    """
    df = df.copy()
    if verbose:
        sum_nan_vals = df[columns].isna().sum()
        nans_cols = []
        for c in columns:
            nans_cols.append((c, df[c].isna().sum()))
    for c in columns:
        df[c] = df[c].astype(object).fillna(fill_token)

    dfc = df.copy()
    dfc[columns] = dfc[columns].apply(lambda x: x.astype(object).fillna(fill_token))
    if verbose:
        print(
            """
            Imputed {} NaN values with {}.
            Affected columns (col, num_NaNs): {}
            """.format(
                sum_nan_vals, fill_token, nans_cols
            )
        )
    return df
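
A minimal sketch using the default 'unknown' token (column name is illustrative):

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import nan_with_unknown_imputer
>>> df = pd.DataFrame({'c': ['x', None]})
>>> nan_with_unknown_imputer(df, ['c'])['c'].tolist()
['x', 'unknown']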

nuq_in_list_col(dfs)

Returns pd.Series with the number of unique values in the lists.

Parameters:

dfs (pd.Series, required): input pandas series containing string list values
Source code in inference_model/preprocessing/preprocess_data.py
def nuq_in_list_col(dfs: pd.Series):
    """Returns pd.Series with the number of unique values in the lists.

    Args:
        dfs (pd.Series): input pandas series containing string list values
    """
    dfsc = dfs.copy()
    dfsc = dfsc.apply(
        lambda string: _nunique(
            list(string.replace("[", "").replace("]", "").split(", "))
        )
        if isinstance(string, str)
        else string
    )
    return dfsc
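
A hedged sketch, assuming the private helper _nunique (not shown here) counts distinct items in the parsed list:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import nuq_in_list_col
>>> s = pd.Series(["[a, a, b]"])
>>> nuq_in_list_col(s).tolist()
[2]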

replace_rare_categories_with_str_other(df, categorical_cols, quantile=0.05, surrogate_value='other', verbose=False)

Replaces rare category values with a surrogate string.

Parameters:

df (pd.DataFrame, required): input dataframe
categorical_cols (List[str], required): list of columns in dataframe to process
quantile (float, default = 0.05): determines which values are considered rare
surrogate_value (str, default = "other"): string used to replace rare values
verbose (bool, default = False): whether the output should be verbose

Returns:

Tuple[pd.DataFrame, Dict]: new dataframe and a dict with the mapping between original and surrogate values

Source code in inference_model/preprocessing/preprocess_data.py
def replace_rare_categories_with_str_other(
    df: pd.DataFrame,
    categorical_cols: List[str],
    quantile: float = 0.05,
    surrogate_value: str = "other",
    verbose: bool = False,
) -> Tuple[pd.DataFrame, Dict]:
    """Replaces rare category value with surrogate string.

    Args:
        df (pd.DataFrame): input dataframe
        categorical_cols (List[str]): list of columns in dataframe to process.
        quantile (float = 0.05): determines which values are considered rare
        surrogate_value (str = "other"): string used to replace rare values
        verbose (bool): whether the output should be verbose

    Returns:
        Tuple[pd.DataFrame, Dict]:
            New dataframe and a dict. with mapping between orig. and surrogate values.
    """
    if surrogate_value in df[categorical_cols].values:
        raise ValueError(
            "Surrogate string - "
            + surrogate_value
            + " - is already present, choose another one."
        )

    dfc = df.copy()
    replace_dict_per_col = (
        df[categorical_cols]
        .apply(_replace_rare_dict, args=[surrogate_value, quantile])
        .to_dict()
    )
    dfc = dfc.replace(replace_dict_per_col)
    if verbose:
        sum_rare_cats = sum(
            [len(replace_dict_per_col[col]) for col in replace_dict_per_col]
        )
        rare_cats_per_col = [
            (col, list(replace_dict_per_col[col].keys()))
            for col in replace_dict_per_col
        ]
        print(
            """
            Replaced {} rare categories (val_count < {}) with {}.
            Affected columns (List[(col, rare_cats)]): {}
            """.format(
                sum_rare_cats,
                quantile,
                surrogate_value,
                rare_cats_per_col,
            )
        )
    return dfc, replace_dict_per_col
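
A minimal sketch; the exact rarity cut-off is computed by the private helper _replace_rare_dict (not shown here), so the mapping shape below is illustrative:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import replace_rare_categories_with_str_other
>>> df = pd.DataFrame({'c': ['a'] * 95 + ['b'] * 5})
>>> df_new, mapping = replace_rare_categories_with_str_other(df, ['c'], quantile=0.05)
>>> mapping  # e.g. {'c': {'b': 'other'}} if 'b' falls below the rarity cut-off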

PreprocessData

Object to preprocess the dataset.

Parameters:

target_col (str, required): target column name
id_cols (List[str], required): id columns
cat_cols (Optional[List[str]], default = None): list of categorical column names
cont_cols (Optional[List[str]], default = None): list of continuous column names

Source code in inference_model/preprocessing/preprocess.py
class PreprocessData:
    """Object to preprocess the dataset.
    Args:
        target_col (str): target column name
        id_cols (List[str]): id columns
        cat_cols (Optional[List[str]]): list of categorical column names
        cont_cols (Optional[List[str]]): list of continuous column names
    """

    def __init__(
        self,
        target_col: str,
        id_cols: List[str],
        cat_cols: Optional[List[str]] = None,
        cont_cols: Optional[List[str]] = None,
    ):
        self.target_col = target_col
        self.id_cols = id_cols
        self.cat_cols = cat_cols
        self.cont_cols = cont_cols

        self.is_fitted = False

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit peprocessor and transform dataset in training step."""

        dfc = df.drop(columns=self.id_cols).copy()

        dfc = dfc.pipe(drop_constant_cols)

        self.cat_cols = intsec(list(dfc), self.cat_cols)

        if self.cat_cols is None:
            self.cat_cols = self._infere_cat_cols(dfc)
            dfc_cat_cols = drop_high_nan_cols(dfc[self.cat_cols])
            dfc = dfc.drop(columns=self.cat_cols)
            dfc = pd.concat([dfc, dfc_cat_cols], axis=1)
            self.cat_cols = dfc_cat_cols.columns.values.tolist()

        if self.cont_cols is None:
            self.init_cont_cols = self._infere_cont_cols(dfc, self.cat_cols)
            dfc_init_cont_cols = drop_high_nan_cols(dfc[self.init_cont_cols])
            dfc = dfc.drop(columns=self.init_cont_cols)
            dfc = pd.concat([dfc, dfc_init_cont_cols], axis=1)
            self.init_cont_cols = dfc_init_cont_cols.columns.values.tolist()

            dfc = nan_with_number_imputer(dfc, self.init_cont_cols, -9999.0)

            (
                dfc,
                self.final_cont_cols,
            ) = self._drop_highly_corr_cols_and_get_final_cont_cols(
                dfc, self.init_cont_cols
            )

        else:
            self.final_cont_cols = list(
                set(self.cont_cols).intersection(set(dfc.columns.values))
            )

            dfc = nan_with_number_imputer(dfc, self.final_cont_cols, -9999.0)

        dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

        dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]

        self.label_encoder = LabelEncoder(self.cat_cols)
        dfc_le = self.label_encoder.fit_transform(dfc)

        dfc_le = self._change_int_float_types(dfc_le)

        dfc_le[self.id_cols] = df[self.id_cols]

        self.is_fitted = True

        return dfc_le

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform dataset in inference step."""

        if not self.is_fitted:
            raise NotFittedError(
                """This instance of 'PreprocessData' has not been fitted yet.
                Please, run 'fit' first"""
            )

        dfc = df.drop(columns=self.id_cols).copy()

        # added as mlflow inference is receiving all the data as objects
        dfc[self.cat_cols] = dfc[self.cat_cols].astype(str)
        dfc[self.final_cont_cols] = dfc[self.final_cont_cols].astype(float)

        try:
            dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]
        except KeyError:
            dfc = dfc[self.cat_cols + self.final_cont_cols]

        # dfc = dfc.replace(self.replace_rare_categories_dict)

        dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

        dfc = nan_with_number_imputer(
            dfc,
            list(set(self.final_cont_cols)),
            -9999.0,  # noqa
        )

        dfc_le = self.label_encoder.transform(dfc)

        dfc_le = self._change_int_float_types(dfc_le)

        dfc_le[self.id_cols] = df[self.id_cols]

        return dfc_le

    def fit(self, df: pd.DataFrame) -> pd.DataFrame:
        """Just to keep familiar naming convention with sklearn."""
        return self.fit_transform(df)

    def _drop_highly_corr_cols_and_get_final_cont_cols(
        self, df: pd.DataFrame, cont_cols: List[str]
    ) -> Tuple[pd.DataFrame, List[str]]:
        """Drop highly correlated columns to decrease the size of the dataset
        by dropping redundant information."""

        df = drop_highly_correlated_columns(df, cont_cols)

        final_cont_cols = [
            c for c in df.columns if c not in self.cat_cols + [self.target_col]
        ]

        return df, final_cont_cols

    def _infere_cat_cols(self, df: pd.DataFrame):
        """Guess the categorical columns by excluding clearly continuous
        columns - int, float"""

        cat_cols = []
        for col in df.columns:
            if (
                df[col].dtype not in ["int32", "int64"]
                and df[col].dtype not in ["float32", "float64"]
                and col not in [self.target_col]
            ):
                cat_cols.append(col)
        return cat_cols

    def _infere_cont_cols(
        self, df: pd.DataFrame, cat_cols: Optional[List[str]]
    ):  # noqa
        """Guess the continuous columns by excluding clearly categorical,
        and target_col and including only int or float."""

        if cat_cols is not None:
            cont_cols = [c for c in df.columns if c not in cat_cols + [self.target_col]]
        else:
            cont_cols = []
            for col in df.columns:
                if (df[col].dtype == "int" or df[col].dtype == "float") and col not in [
                    self.target_col
                ]:
                    cont_cols.append(col)

        return cont_cols

    def _change_int_float_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Change int to float as int causes data type issues in some ml
        methods, e.g. lightgbm."""
        dfc = df.copy()
        dfc = dfc.astype(
            dict.fromkeys(dfc.select_dtypes(np.int64).columns, np.int32)
        )  # noqa
        dfc = dfc.astype(
            dict.fromkeys(dfc.select_dtypes(np.float64).columns, np.float32)
        )

        return dfc
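
A minimal usage sketch (train_df, test_df and the column names are illustrative, assuming the input frames contain the target and id columns):

>>> prep = PreprocessData(
...     target_col='target',
...     id_cols=['id'],
...     cat_cols=['color'],
...     cont_cols=['price'],
... )
>>> train_ready = prep.fit_transform(train_df)
>>> test_ready = prep.transform(test_df)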

fit(df)

Just to keep familiar naming convention with sklearn.

Source code in inference_model/preprocessing/preprocess.py
def fit(self, df: pd.DataFrame) -> pd.DataFrame:
    """Just to keep familiar naming convention with sklearn."""
    return self.fit_transform(df)

fit_transform(df)

Fit preprocessor and transform dataset in training step.

Source code in inference_model/preprocessing/preprocess.py
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Fit peprocessor and transform dataset in training step."""

    dfc = df.drop(columns=self.id_cols).copy()

    dfc = dfc.pipe(drop_constant_cols)

    self.cat_cols = intsec(list(dfc), self.cat_cols)

    if self.cat_cols is None:
        self.cat_cols = self._infere_cat_cols(dfc)
        dfc_cat_cols = drop_high_nan_cols(dfc[self.cat_cols])
        dfc = dfc.drop(columns=self.cat_cols)
        dfc = pd.concat([dfc, dfc_cat_cols], axis=1)
        self.cat_cols = dfc_cat_cols.columns.values.tolist()

    if self.cont_cols is None:
        self.init_cont_cols = self._infere_cont_cols(dfc, self.cat_cols)
        dfc_init_cont_cols = drop_high_nan_cols(dfc[self.init_cont_cols])
        dfc = dfc.drop(columns=self.init_cont_cols)
        dfc = pd.concat([dfc, dfc_init_cont_cols], axis=1)
        self.init_cont_cols = dfc_init_cont_cols.columns.values.tolist()

        dfc = nan_with_number_imputer(dfc, self.init_cont_cols, -9999.0)

        (
            dfc,
            self.final_cont_cols,
        ) = self._drop_highly_corr_cols_and_get_final_cont_cols(
            dfc, self.init_cont_cols
        )

    else:
        self.final_cont_cols = list(
            set(self.cont_cols).intersection(set(dfc.columns.values))
        )

        dfc = nan_with_number_imputer(dfc, self.final_cont_cols, -9999.0)

    dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

    dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]

    self.label_encoder = LabelEncoder(self.cat_cols)
    dfc_le = self.label_encoder.fit_transform(dfc)

    dfc_le = self._change_int_float_types(dfc_le)

    dfc_le[self.id_cols] = df[self.id_cols]

    self.is_fitted = True

    return dfc_le

transform(df)

Transform dataset in inference step.

Source code in inference_model/preprocessing/preprocess.py
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Transform dataset in inference step."""

    if not self.is_fitted:
        raise NotFittedError(
            """This instance of 'PreprocessData' has not been fitted yet.
            Please, run 'fit' first"""
        )

    dfc = df.drop(columns=self.id_cols).copy()

    # added as mlflow inference is receiving all the data as objects
    dfc[self.cat_cols] = dfc[self.cat_cols].astype(str)
    dfc[self.final_cont_cols] = dfc[self.final_cont_cols].astype(float)

    try:
        dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]
    except KeyError:
        dfc = dfc[self.cat_cols + self.final_cont_cols]

    # dfc = dfc.replace(self.replace_rare_categories_dict)

    dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

    dfc = nan_with_number_imputer(
        dfc,
        list(set(self.final_cont_cols)),
        -9999.0,  # noqa
    )

    dfc_le = self.label_encoder.transform(dfc)

    dfc_le = self._change_int_float_types(dfc_le)

    dfc_le[self.id_cols] = df[self.id_cols]

    return dfc_le

scaler_mapper(cont_cols, cat_cols, id_cols, scaler_mapper_def=None)

Function that maps scaler functions to appropriate columns.

By default, no scaler is assigned to continuous, categorical, or identifier columns; scalers must be set in scaler_mapper_def using sklearn scaler classes. Only columns defined in the mapper object will be present in the transformed dataset.

Parameters:

cont_cols (list, required): list of continuous feature columns in the dataset
cat_cols (list, required): list of categorical feature columns in the dataset
id_cols (list, required): identifier columns
scaler_mapper_def (dict, default = None): optional dictionary that contains keys ['cont_cols', 'cat_cols', 'id_cols'] with their corresponding scalers (defined by names, not instantiated) from the sklearn library
Source code in inference_model/preprocessing/scaler.py
def scaler_mapper(
    cont_cols: List[str],
    cat_cols: List[str],
    id_cols: List[str],
    scaler_mapper_def: Optional[dict] = None,
) -> DataFrameMapper:
    """Function that maps scaler functions to appropriate columns.

    By default does not assign any scaler to continuous, categorical or
    identifier columns. The scalers must be set in scaler_mapper_def. Use sklearn scalers.
    Only columns defined in mapper object will be present in the transformed dataset.

    Args:
        cont_cols (list): list of continuous feature columns in the dataset
        cat_cols (list): list of categorical feature columns in the dataset
        id_cols (list): identifier columns
        scaler_mapper_def (dict): optional dictionary that contains keys
            ['cont_cols', 'cat_cols', 'id_cols'] with their corresponding
            scalers (defined by names, not instantiated) from sklearn library
    Returns:
        scaler (DataFrameMapper): scaler object mapping sklearn scalers to columns in
            pandas dataframe
    """
    if scaler_mapper_def:
        cont_cols_def = gen_features(
            columns=list(map(lambda x: [x], cont_cols)),
            classes=[scaler_mapper_def["cont_cols"]],
        )

        cat_cols_def = gen_features(
            columns=list(map(lambda x: [x], cat_cols)),
            classes=[scaler_mapper_def["cat_cols"]],
        )

        id_cols_def = gen_features(
            columns=list(map(lambda x: [x], id_cols)),
            classes=[scaler_mapper_def["id_cols"]],
        )

    else:
        cont_cols_def = gen_features(
            columns=list(map(lambda x: [x], cont_cols)), classes=[None]
        )

        cat_cols_def = gen_features(
            columns=list(map(lambda x: [x], cat_cols)), classes=[None]
        )

        id_cols_def = gen_features(
            columns=list(map(lambda x: [x], id_cols)), classes=[None]
        )

    scaler = DataFrameMapper(cont_cols_def + cat_cols_def + id_cols_def, df_out=True)
    return scaler
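
A hedged sketch of passing sklearn scaler classes through scaler_mapper_def (the StandardScaler/None choices, column names, and input dataframe df are illustrative):

>>> from sklearn.preprocessing import StandardScaler
>>> mapper = scaler_mapper(
...     cont_cols=['price'],
...     cat_cols=['color'],
...     id_cols=['id'],
...     scaler_mapper_def={'cont_cols': StandardScaler, 'cat_cols': None, 'id_cols': None},
... )
>>> df_out = mapper.fit_transform(df)  # df must contain the mapped columns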