
Preprocess data

LabelEncoder

Bases: object

Label encode categorical values for multiple columns at once

ℹ️ NOTE: Shamelessly copied from https://github.com/jrzaurin/pytorch-widedeep

ℹ️ NOTE: LabelEncoder reserves 0 for unseen new categories. This is convenient when defining the embedding layers, since we can just set padding idx to 0.
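For example (a minimal sketch, assuming the module is importable from the source path shown below): categories seen during fit are mapped to 1..N, while a category first seen at transform time maps to 0:

>>> import pandas as pd
>>> from inference_model.preprocessing.label_encoder import LabelEncoder
>>> encoder = LabelEncoder(['col'])
>>> _ = encoder.fit(pd.DataFrame({'col': ['a', 'b']}))
>>> encoder.transform(pd.DataFrame({'col': ['a', 'c']}))['col'].tolist()
[1, 0]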

Parameters:

columns_to_encode (list, Optional, default = None): List of strings containing the names of the columns to encode. If None, all columns of type object in the dataframe will be label encoded.

Attributes:

encoding_dict (Dict): Dictionary containing the encoding mappings, e.g.:
    {'colname1': {'cat1': 1, 'cat2': 2, ...}, 'colname2': {'cat1': 1, 'cat2': 2, ...}, ...}

inverse_encoding_dict (Dict): Dictionary containing the inverse encoding mappings, e.g.:
    {'colname1': {1: 'cat1', 2: 'cat2', ...}, 'colname2': {1: 'cat1', 2: 'cat2', ...}, ...}

Source code in inference_model/preprocessing/label_encoder.py
class LabelEncoder(object):
    r"""Label Encode categorical values for multiple columns at once

    :information_source: **NOTE**:
    Shamelessly copied from https://github.com/jrzaurin/pytorch-widedeep

    :information_source: **NOTE**:
    LabelEncoder reserves 0 for `unseen` new categories. This is convenient
    when defining the embedding layers, since we can just set padding idx to 0.

    Parameters:
        columns_to_encode (list, Optional, default = None): List of strings containing
            the names of the columns to encode. If `None` all columns of type `object`
            in the dataframe will be label encoded.

    Attributes:
        encoding_dict (Dict): Dictionary containing the encoding mappings in the format,
            e.g. : <br/> `{'colname1': {'cat1': 1, 'cat2': 2, ...}, 'colname2': {'cat1': 1, 'cat2': 2, ...}, ...}`  # noqa
        inverse_encoding_dict (Dict): Dictionary containing the inverse encoding mappings
            in the format, e.g. : <br/> `{'colname1': {1: 'cat1', 2: 'cat2', ...}, 'colname2': {1: 'cat1', 2: 'cat2', ...}, ...}`  # noqa
    """

    def __init__(
        self,
        columns_to_encode: Optional[List[str]] = None,
    ):
        self.columns_to_encode = columns_to_encode

    def fit(self, df: pd.DataFrame) -> "LabelEncoder":
        """Creates encoding attributes

        Returns:
            LabelEncoder: `LabelEncoder` fitted object
        """

        df_inp = df.copy()

        if self.columns_to_encode is None:
            self.columns_to_encode = list(
                df_inp.select_dtypes(include=["object"]).columns
            )
        else:
            # sanity check to make sure all categorical columns are in an adequate
            # format
            for col in self.columns_to_encode:
                df_inp[col] = df_inp[col].astype("O")

        unique_column_vals = dict()
        for c in self.columns_to_encode:
            unique_column_vals[c] = df_inp[c].unique()

        self.encoding_dict = dict()

        # leave 0 for padding/"unseen" categories; seen values start at 1
        for k, v in unique_column_vals.items():
            self.encoding_dict[k] = {o: i + 1 for i, o in enumerate(v)}

        self.inverse_encoding_dict = dict()
        for c in self.encoding_dict:
            self.inverse_encoding_dict[c] = {
                v: k for k, v in self.encoding_dict[c].items()
            }
            self.inverse_encoding_dict[c][0] = "unseen"

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Label Encoded the categories in `columns_to_encode`

        Returns:
            pd.DataFrame: label-encoded dataframe
        """
        try:
            self.encoding_dict
        except AttributeError:
            raise NotFittedError(
                "This LabelEncoder instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this LabelEncoder."
            )

        df_inp = df.copy()
        # sanity check to make sure all categorical columns are in an adequate
        # format
        for col in self.columns_to_encode:  # type: ignore
            df_inp[col] = df_inp[col].astype("O")

        for k, v in self.encoding_dict.items():
            df_inp[k] = df_inp[k].apply(lambda x: v.get(x, 0))

        return df_inp

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Combines `fit` and `transform`

        Returns:
            pd.DataFrame: label-encoded dataframe

        Examples:
            >>> import pandas as pd
            >>> from data_preparation.label_encoder import LabelEncoder
            >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
            >>> columns_to_encode = ['col2']
            >>> encoder = LabelEncoder(columns_to_encode)
            >>> encoder.fit_transform(df)
               col1  col2
            0     1     1
            1     2     2
            2     3     3
            >>> encoder.encoding_dict
            {'col2': {'me': 1, 'you': 2, 'him': 3}}
        """
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Returns the original categories

        Returns:
            pd.DataFrame: dataframe with the original categories restored

        Examples:
            >>> import pandas as pd
            >>> from data_preparation.label_encoder import LabelEncoder
            >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
            >>> columns_to_encode = ['col2']
            >>> encoder = LabelEncoder(columns_to_encode)
            >>> df_enc = encoder.fit_transform(df)
            >>> encoder.inverse_transform(df_enc)
               col1 col2
            0     1   me
            1     2  you
            2     3  him
        """
        df = df.copy()  # avoid mutating the caller's dataframe
        for k, v in self.inverse_encoding_dict.items():
            df[k] = df[k].apply(lambda x: v[x])
        return df

fit(df)

Creates encoding attributes

Returns:

LabelEncoder: the fitted LabelEncoder object

Source code in inference_model/preprocessing/label_encoder.py
def fit(self, df: pd.DataFrame) -> "LabelEncoder":
    """Creates encoding attributes

    Returns:
        LabelEncoder: `LabelEncoder` fitted object
    """

    df_inp = df.copy()

    if self.columns_to_encode is None:
        self.columns_to_encode = list(
            df_inp.select_dtypes(include=["object"]).columns
        )
    else:
        # sanity check to make sure all categorical columns are in an adequate
        # format
        for col in self.columns_to_encode:
            df_inp[col] = df_inp[col].astype("O")

    unique_column_vals = dict()
    for c in self.columns_to_encode:
        unique_column_vals[c] = df_inp[c].unique()

    self.encoding_dict = dict()

    # leave 0 for padding/"unseen" categories; seen values start at 1
    for k, v in unique_column_vals.items():
        self.encoding_dict[k] = {o: i + 1 for i, o in enumerate(v)}

    self.inverse_encoding_dict = dict()
    for c in self.encoding_dict:
        self.inverse_encoding_dict[c] = {
            v: k for k, v in self.encoding_dict[c].items()
        }
        self.inverse_encoding_dict[c][0] = "unseen"

    return self

fit_transform(df)

Combines fit and transform

Returns:

pd.DataFrame: label-encoded dataframe

Examples:

>>> import pandas as pd
>>> from data_preparation.label_encoder import LabelEncoder
>>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
>>> columns_to_encode = ['col2']
>>> encoder = LabelEncoder(columns_to_encode)
>>> encoder.fit_transform(df)
   col1  col2
0     1     1
1     2     2
2     3     3
>>> encoder.encoding_dict
{'col2': {'me': 1, 'you': 2, 'him': 3}}
Source code in inference_model/preprocessing/label_encoder.py
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Combines `fit` and `transform`

    Returns:
        pd.DataFrame: label-encoded dataframe

    Examples:
        >>> import pandas as pd
        >>> from data_preparation.label_encoder import LabelEncoder
        >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
        >>> columns_to_encode = ['col2']
        >>> encoder = LabelEncoder(columns_to_encode)
        >>> encoder.fit_transform(df)
           col1  col2
        0     1     1
        1     2     2
        2     3     3
        >>> encoder.encoding_dict
        {'col2': {'me': 1, 'you': 2, 'him': 3}}
    """
    return self.fit(df).transform(df)

inverse_transform(df)

Returns the original categories

Returns:

pd.DataFrame: dataframe with the original categories restored

Examples:

>>> import pandas as pd
>>> from data_preparation.label_encoder import LabelEncoder
>>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
>>> columns_to_encode = ['col2']
>>> encoder = LabelEncoder(columns_to_encode)
>>> df_enc = encoder.fit_transform(df)
>>> encoder.inverse_transform(df_enc)
   col1 col2
0     1   me
1     2  you
2     3  him
Source code in inference_model/preprocessing/label_encoder.py
def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Returns the original categories

    Returns:
        pd.DataFrame: dataframe with the original categories restored

    Examples:
        >>> import pandas as pd
        >>> from data_preparation.label_encoder import LabelEncoder
        >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
        >>> columns_to_encode = ['col2']
        >>> encoder = LabelEncoder(columns_to_encode)
        >>> df_enc = encoder.fit_transform(df)
        >>> encoder.inverse_transform(df_enc)
           col1 col2
        0     1   me
        1     2  you
        2     3  him
    """
    df = df.copy()  # avoid mutating the caller's dataframe
    for k, v in self.inverse_encoding_dict.items():
        df[k] = df[k].apply(lambda x: v[x])
    return df

transform(df)

Label encodes the categories in columns_to_encode

Returns:

pd.DataFrame: label-encoded dataframe

Source code in inference_model/preprocessing/label_encoder.py
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Label Encoded the categories in `columns_to_encode`

    Returns:
        pd.DataFrame: label-encoded dataframe
    """
    try:
        self.encoding_dict
    except AttributeError:
        raise NotFittedError(
            "This LabelEncoder instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before using this LabelEncoder."
        )

    df_inp = df.copy()
    # sanity check to make sure all categorical columns are in an adequate
    # format
    for col in self.columns_to_encode:  # type: ignore
        df_inp[col] = df_inp[col].astype("O")

    for k, v in self.encoding_dict.items():
        df_inp[k] = df_inp[k].apply(lambda x: v.get(x, 0))

    return df_inp

drop_constant_cols(df, verbose=False)

Returns dataframe without constant columns, i.e. those with just 1 unique value for all rows.

Source code in inference_model/preprocessing/preprocess_data.py
def drop_constant_cols(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Returns dataframe without constant columns, i.e. those with just 1 unique
    value for all rows."""
    nunique_per_col = df.apply(lambda x: x.nunique(), axis=0)
    const_cols = nunique_per_col[nunique_per_col == 1].index.values
    df = df.drop(const_cols, axis=1)
    if verbose:
        print(
            """
            Dropped {} constant columns.
            Affected columns: {}
            """.format(
                len(const_cols), const_cols
            )
        )
    return df
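
A quick doctest-style sketch (column names are illustrative; the import assumes the module is importable from the source path shown above):

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_constant_cols
>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0]})
>>> drop_constant_cols(df).columns.tolist()
['a']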

drop_high_nan_cols(df, threshold=0.8, verbose=False)

Returns dataframe without columns that have ratio of missingness above threshold.

Parameters:

df (pd.DataFrame, required): input dataframe
threshold (float, default = 0.8): ratio of missingness applied per column
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def drop_high_nan_cols(
    df: pd.DataFrame, threshold: float = 0.8, verbose: bool = False
) -> pd.DataFrame:
    """Returns dataframe without columns that have ratio of missingness above threshold.

    Args:
        df (pd.DataFrame): input dataframe
        threshold (float = 0.8): ratio of missingness applied per column
        verbose (bool): whether the output should be verbose
    """
    n_rows = df.shape[0]
    nan_fraction_per_col = df.apply(lambda x: x.isna().sum() / n_rows, axis=0)
    high_nan_percentage_cols = nan_fraction_per_col[
        nan_fraction_per_col > threshold
    ].index.values
    df = df.drop(high_nan_percentage_cols, axis=1)
    if verbose:
        print(
            """
            Dropped {} columns with fraction of NaN above threshold = {}.
            Affected columns: {}
            """.format(
                len(high_nan_percentage_cols), threshold, high_nan_percentage_cols
            )
        )
    return df
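
For instance, with a 0.5 threshold a column that is 75% NaN is dropped (a minimal sketch; column names are illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_high_nan_cols
>>> df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [np.nan, np.nan, np.nan, 4]})
>>> drop_high_nan_cols(df, threshold=0.5).columns.tolist()
['a']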

drop_high_uq_cat_cols(df, cat_cols, uq_val_count, verbose=False)

Returns dataframe without categorical columns that have too many unique values

Parameters:

df (pd.DataFrame, required): input dataframe
cat_cols (list, required): list of categorical columns
uq_val_count (int, required): maximum number of unique values allowed; columns with more unique values are dropped
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def drop_high_uq_cat_cols(
    df: pd.DataFrame, cat_cols: list, uq_val_count: int, verbose: bool = False
) -> pd.DataFrame:
    """Returns dataframe without categorical columns that have too many unique values

    Args:
        df (pd.DataFrame): input dataframe
        cat_cols (list): list of categorical columns
        uq_val_count (int): maximum number of unique values allowed; columns
            with more unique values are dropped
        verbose (bool): whether the output should be verbose
    """
    nunique_per_col = df[cat_cols].apply(lambda x: x.nunique(), axis=0)
    high_uq_value_cat_cols = nunique_per_col[
        nunique_per_col > uq_val_count
    ].index.values
    df = df.drop(high_uq_value_cat_cols, axis=1)
    if verbose:
        print(
            """
            Dropped {} high unique value columns.
            Affected columns: {}
            """.format(
                len(high_uq_value_cat_cols), high_uq_value_cat_cols
            )
        )
    return df
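
A minimal sketch (column names are illustrative): with uq_val_count=3, an ID-like column with 4 unique values is dropped while a low-cardinality column is kept:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_high_uq_cat_cols
>>> df = pd.DataFrame({'user': ['u1', 'u2', 'u3', 'u4'], 'color': ['r', 'g', 'r', 'g']})
>>> drop_high_uq_cat_cols(df, cat_cols=['user', 'color'], uq_val_count=3).columns.tolist()
['color']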

drop_highly_correlated_columns(df, cont_cols, crosscorr_val=0.95, verbose=False)

Returns dataframe without highly correlated columns; a column is dropped when its absolute pairwise correlation with another column exceeds crosscorr_val.

Parameters:

df (pd.DataFrame, required): input dataframe
cont_cols (list, required): list of columns to evaluate correlation for
crosscorr_val (float, default = 0.95): threshold value of correlation
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def drop_highly_correlated_columns(
    df: pd.DataFrame,
    cont_cols: list,
    crosscorr_val: float = 0.95,
    verbose: bool = False,
) -> pd.DataFrame:
    """Returns dataframe without highly correlated columns, cross correlation is
    evaluated with crosscorr_val.

    Args:
        df (pd.DataFrame): input dataframe
        cont_cols (list): list of columns to evaluate correlation for
        crosscorr_val (float = 0.95): threshold value of correlation
        verbose (bool): whether the output should be verbose
    """
    corr_matrix = df[cont_cols].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype("bool"))
    upper_above = upper.apply(lambda x: any(x > crosscorr_val), axis=1)
    to_drop = upper_above[upper_above].index.values
    df = df.drop(to_drop, axis=1)
    if verbose:
        print(
            """
            Dropped {} highly correlated columns.
            Affected columns: {}
            """.format(
                len(to_drop), to_drop
            )
        )
    return df
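
A minimal sketch (values are illustrative); note that for a perfectly correlated pair this implementation drops the column that appears first:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import drop_highly_correlated_columns
>>> df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0],
...                    'y': [2.0, 4.0, 6.0, 8.0],
...                    'z': [4.0, 1.0, 3.0, 2.0]})
>>> drop_highly_correlated_columns(df, cont_cols=['x', 'y', 'z']).columns.tolist()
['y', 'z']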

most_frequent_in_list_col(dfs)

Returns pd.Series with the most frequent values in the lists.

Parameters:

dfs (pd.Series, required): input pandas series containing string list values
Source code in inference_model/preprocessing/preprocess_data.py
def most_frequent_in_list_col(dfs: pd.Series):
    """Returns pd.Series with the most frequent values in the lists.

    Args:
        dfs (pd.Series): input pandas series containing string list values
    """
    dfsc = dfs.copy()
    dfsc = dfsc.apply(
        lambda string: _most_frequent(
            list(string.replace("[", "").replace("]", "").split(", "))
        )
        if isinstance(string, str)
        else string
    )
    return dfsc
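
A hedged sketch, assuming the private helper _most_frequent (not shown here) returns the most common item of the parsed list:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import most_frequent_in_list_col
>>> s = pd.Series(["[a, a, b]", "[c]"])
>>> most_frequent_in_list_col(s).tolist()
['a', 'c']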

nan_with_number_imputer(df, columns, fill_number=-1.0, verbose=False)

Fills NaNs with a surrogate float; -1.0 is the default and can be customized.

Parameters:

df (pd.DataFrame, required): input dataframe
columns (List[str], required): list of columns that will be filled
fill_number (float, default = -1.0): number used to replace NaNs
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def nan_with_number_imputer(
    df: pd.DataFrame,
    columns: List[str],
    fill_number: float = -1.0,
    verbose: bool = False,
) -> pd.DataFrame:
    """Fills NAs with surrogate float, -1 is default value used, it can be customized.

    Args:
        df (pd.DataFrame): input dataframe
        columns (List[str]): list of columns that will be filled
        fill_number (float = -1): number used to replace NAs. Defaults to
        verbose (bool): whether the output should be verbose
    """
    dfc = df.copy()
    if verbose:
        sum_nan_vals = df[columns].isna().sum()
        nans_cols = []
        for c in columns:
            nans_cols.append((c, df[c].isna().sum()))
    dfc = df.copy()
    dfc[columns] = dfc[columns].apply(lambda x: x.astype(float).fillna(fill_number))
    if verbose:
        print(
            """
            Imputed {} NaN values with {}.
            Affected columns (col, num_NaNs): {}
            """.format(
                sum_nan_vals, fill_number, nans_cols
            )
        )
    return dfc
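
A minimal sketch using the default surrogate of -1.0 (column name is illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import nan_with_number_imputer
>>> df = pd.DataFrame({'a': [1.0, np.nan]})
>>> nan_with_number_imputer(df, ['a'])['a'].tolist()
[1.0, -1.0]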

nan_with_unknown_imputer(df, columns, fill_token='unknown', verbose=False)

Fills NaNs with a surrogate string; 'unknown' is the default and can be customized.

Parameters:

df (pd.DataFrame, required): input dataframe
columns (List[str], required): list of columns that will be filled
fill_token (str, default = "unknown"): string used to replace NaNs
verbose (bool, default = False): whether the output should be verbose
Source code in inference_model/preprocessing/preprocess_data.py
def nan_with_unknown_imputer(
    df: pd.DataFrame,
    columns: List[str],
    fill_token: str = "unknown",
    verbose: bool = False,
) -> pd.DataFrame:
    """Fills NAs with surrogate string, 'unknown' is default value used, it can be customized.

    Args:
        df (pd.DataFrame): input dataframe
        columns (List[str]): ist of columns that will be filled
        fillna_token (str = "unknown"): string used to replace NAs
        verbose (bool): whether the output should be verbose
    """
    df = df.copy()
    if verbose:
        sum_nan_vals = df[columns].isna().sum()
        nans_cols = []
        for c in columns:
            nans_cols.append((c, df[c].isna().sum()))
    for c in columns:
        df[c] = df[c].astype(object).fillna(fill_token)

    dfc = df.copy()
    dfc[columns] = dfc[columns].apply(lambda x: x.astype(object).fillna(fill_token))
    if verbose:
        print(
            """
            Imputed {} NaN values with {}.
            Affected columns (col, num_NaNs): {}
            """.format(
                sum_nan_vals, fill_token, nans_cols
            )
        )
    return df
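
A minimal sketch using the default 'unknown' token (column name is illustrative):

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import nan_with_unknown_imputer
>>> df = pd.DataFrame({'c': ['x', None]})
>>> nan_with_unknown_imputer(df, ['c'])['c'].tolist()
['x', 'unknown']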

nuq_in_list_col(dfs)

Returns pd.Series with the number of unique values in the lists.

Parameters:

dfs (pd.Series, required): input pandas series containing string list values
Source code in inference_model/preprocessing/preprocess_data.py
def nuq_in_list_col(dfs: pd.Series):
    """Returns pd.Series with the number of unique values in the lists.

    Args:
        dfs (pd.Series): input pandas series containing string list values
    """
    dfsc = dfs.copy()
    dfsc = dfsc.apply(
        lambda string: _nunique(
            list(string.replace("[", "").replace("]", "").split(", "))
        )
        if isinstance(string, str)
        else string
    )
    return dfsc
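
A hedged sketch, assuming the private helper _nunique (not shown here) counts distinct items in the parsed list:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import nuq_in_list_col
>>> s = pd.Series(["[a, a, b]"])
>>> nuq_in_list_col(s).tolist()
[2]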

replace_rare_categories_with_str_other(df, categorical_cols, quantile=0.05, surrogate_value='other', verbose=False)

Replaces rare category values with a surrogate string.

Parameters:

df (pd.DataFrame, required): input dataframe
categorical_cols (List[str], required): list of columns in dataframe to process
quantile (float, default = 0.05): determines which values are considered rare
surrogate_value (str, default = "other"): string used to replace rare values
verbose (bool, default = False): whether the output should be verbose

Returns:

Tuple[pd.DataFrame, Dict]: new dataframe and a dict with the mapping between original and surrogate values

Source code in inference_model/preprocessing/preprocess_data.py
def replace_rare_categories_with_str_other(
    df: pd.DataFrame,
    categorical_cols: List[str],
    quantile: float = 0.05,
    surrogate_value: str = "other",
    verbose: bool = False,
) -> Tuple[pd.DataFrame, Dict]:
    """Replaces rare category value with surrogate string.

    Args:
        df (pd.DataFrame): input dataframe
        categorical_cols (List[str]): list of columns in dataframe to process.
        quantile (float = 0.05): determines which values are considered rare
        surrogate_value (str = "other"): string used to replace rare values
        verbose (bool): whether the output should be verbose

    Returns:
        Tuple[pd.DataFrame, Dict]:
            New dataframe and a dict. with mapping between orig. and surrogate values.
    """
    if surrogate_value in df[categorical_cols].values:
        raise ValueError(
            "Surrogate string - "
            + surrogate_value
            + " - is already present, choose another one."
        )

    dfc = df.copy()
    replace_dict_per_col = (
        df[categorical_cols]
        .apply(_replace_rare_dict, args=[surrogate_value, quantile])
        .to_dict()
    )
    dfc = dfc.replace(replace_dict_per_col)
    if verbose:
        sum_rare_cats = sum(
            [len(replace_dict_per_col[col]) for col in replace_dict_per_col]
        )
        rare_cats_per_col = [
            (col, list(replace_dict_per_col[col].keys()))
            for col in replace_dict_per_col
        ]
        print(
            """
            Replaced {} rare categories (val_count < {}) with {}.
            Affected columns (List[(col, rare_cats)]): {}
            """.format(
                sum_rare_cats,
                quantile,
                surrogate_value,
                rare_cats_per_col,
            )
        )
    return dfc, replace_dict_per_col
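
A minimal sketch; the exact rarity cut-off is computed by the private helper _replace_rare_dict (not shown here), so the mapping shape below is illustrative:

>>> import pandas as pd
>>> from inference_model.preprocessing.preprocess_data import replace_rare_categories_with_str_other
>>> df = pd.DataFrame({'c': ['a'] * 95 + ['b'] * 5})
>>> df_new, mapping = replace_rare_categories_with_str_other(df, ['c'], quantile=0.05)
>>> mapping  # e.g. {'c': {'b': 'other'}} if 'b' falls below the rarity cut-off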

PreprocessData

Object to preprocess the dataset.

Parameters:

target_col (str, required): target column name
id_cols (List[str], required): id columns
cat_cols (Optional[List[str]], default = None): list of categorical column names
cont_cols (Optional[List[str]], default = None): list of continuous column names

Source code in inference_model/preprocessing/preprocess.py
class PreprocessData:
    """Object to preprocess the dataset.
    Args:
        target_col (str): target column name
        id_cols (List[str]): id columns
        cat_cols (Optional[List[str]]): list of categorical column names
        cont_cols (Optional[List[str]]): list of continuous column names
    """

    def __init__(
        self,
        target_col: str,
        id_cols: List[str],
        cat_cols: Optional[List[str]] = None,
        cont_cols: Optional[List[str]] = None,
    ):
        self.target_col = target_col
        self.id_cols = id_cols
        self.cat_cols = cat_cols
        self.cont_cols = cont_cols

        self.is_fitted = False

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit peprocessor and transform dataset in training step."""

        dfc = df.drop(columns=self.id_cols).copy()

        dfc = dfc.pipe(drop_constant_cols)

        self.cat_cols = intsec(list(dfc), self.cat_cols)

        if self.cat_cols is None:
            self.cat_cols = self._infere_cat_cols(dfc)
            dfc_cat_cols = drop_high_nan_cols(dfc[self.cat_cols])
            dfc = dfc.drop(columns=self.cat_cols)
            dfc = pd.concat([dfc, dfc_cat_cols], axis=1)
            self.cat_cols = dfc_cat_cols.columns.values.tolist()

        if self.cont_cols is None:
            self.init_cont_cols = self._infere_cont_cols(dfc, self.cat_cols)
            dfc_init_cont_cols = drop_high_nan_cols(dfc[self.init_cont_cols])
            dfc = dfc.drop(columns=self.init_cont_cols)
            dfc = pd.concat([dfc, dfc_init_cont_cols], axis=1)
            self.init_cont_cols = dfc_init_cont_cols.columns.values.tolist()

            dfc = nan_with_number_imputer(dfc, self.init_cont_cols, -9999.0)

            (
                dfc,
                self.final_cont_cols,
            ) = self._drop_highly_corr_cols_and_get_final_cont_cols(
                dfc, self.init_cont_cols
            )

        else:
            self.final_cont_cols = list(
                set(self.cont_cols).intersection(set(dfc.columns.values))
            )

            dfc = nan_with_number_imputer(dfc, self.final_cont_cols, -9999.0)

        dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

        dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]

        self.label_encoder = LabelEncoder(self.cat_cols)
        dfc_le = self.label_encoder.fit_transform(dfc)

        dfc_le = self._change_int_float_types(dfc_le)

        dfc_le[self.id_cols] = df[self.id_cols]

        self.is_fitted = True

        return dfc_le

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform dataset in inference step."""

        if not self.is_fitted:
            raise NotFittedError(
                """This instance of 'PreprocessData' has not been fitted yet.
                Please, run 'fit' first"""
            )

        dfc = df.drop(columns=self.id_cols).copy()

        # added as mlflow inference is receiving all the data as objects
        dfc[self.cat_cols] = dfc[self.cat_cols].astype(str)
        dfc[self.final_cont_cols] = dfc[self.final_cont_cols].astype(float)

        try:
            dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]
        except KeyError:
            dfc = dfc[self.cat_cols + self.final_cont_cols]

        # dfc = dfc.replace(self.replace_rare_categories_dict)

        dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

        dfc = nan_with_number_imputer(
            dfc,
            list(set(self.final_cont_cols)),
            -9999.0,  # noqa
        )

        dfc_le = self.label_encoder.transform(dfc)

        dfc_le = self._change_int_float_types(dfc_le)

        dfc_le[self.id_cols] = df[self.id_cols]

        return dfc_le

    def fit(self, df: pd.DataFrame) -> pd.DataFrame:
        """Just to keep familiar naming convention with sklearn."""
        return self.fit_transform(df)

    def _drop_highly_corr_cols_and_get_final_cont_cols(
        self, df: pd.DataFrame, cont_cols: List[str]
    ) -> Tuple[pd.DataFrame, List[str]]:
        """Drop highly correlated columns to decrease the size of the dataset
        by dropping redundant information."""

        df = drop_highly_correlated_columns(df, cont_cols)

        final_cont_cols = [
            c for c in df.columns if c not in self.cat_cols + [self.target_col]
        ]

        return df, final_cont_cols

    def _infere_cat_cols(self, df: pd.DataFrame):
        """Guess the categorical columns by excluding clearly continuous
        columns - int, float"""

        cat_cols = []
        for col in df.columns:
            if (
                df[col].dtype not in ["int32", "int64"]
                and df[col].dtype not in ["float32", "float64"]
                and col not in [self.target_col]
            ):
                cat_cols.append(col)
        return cat_cols

    def _infere_cont_cols(
        self, df: pd.DataFrame, cat_cols: Optional[List[str]]
    ):  # noqa
        """Guess the continuous columns by excluding clearly categorical,
        and target_col and including only int or float."""

        if cat_cols is not None:
            cont_cols = [c for c in df.columns if c not in cat_cols + [self.target_col]]
        else:
            cont_cols = []
            for col in df.columns:
                if (df[col].dtype == "int" or df[col].dtype == "float") and col not in [
                    self.target_col
                ]:
                    cont_cols.append(col)

        return cont_cols

    def _change_int_float_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Change int to float as int causes data type issues in some ml
        methods, e.g. lightgbm."""
        dfc = df.copy()
        dfc = dfc.astype(
            dict.fromkeys(dfc.select_dtypes(np.int64).columns, np.int32)
        )  # noqa
        dfc = dfc.astype(
            dict.fromkeys(dfc.select_dtypes(np.float64).columns, np.float32)
        )

        return dfc
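
A minimal usage sketch (train_df, test_df and the column names are illustrative, assuming the input frames contain the target and id columns):

>>> prep = PreprocessData(
...     target_col='target',
...     id_cols=['id'],
...     cat_cols=['color'],
...     cont_cols=['price'],
... )
>>> train_ready = prep.fit_transform(train_df)
>>> test_ready = prep.transform(test_df)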

fit(df)

Just to keep familiar naming convention with sklearn.

Source code in inference_model/preprocessing/preprocess.py
def fit(self, df: pd.DataFrame) -> pd.DataFrame:
    """Just to keep familiar naming convention with sklearn."""
    return self.fit_transform(df)

fit_transform(df)

Fit preprocessor and transform dataset in training step.

Source code in inference_model/preprocessing/preprocess.py
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Fit peprocessor and transform dataset in training step."""

    dfc = df.drop(columns=self.id_cols).copy()

    dfc = dfc.pipe(drop_constant_cols)

    self.cat_cols = intsec(list(dfc), self.cat_cols)

    if self.cat_cols is None:
        self.cat_cols = self._infere_cat_cols(dfc)
        dfc_cat_cols = drop_high_nan_cols(dfc[self.cat_cols])
        dfc = dfc.drop(columns=self.cat_cols)
        dfc = pd.concat([dfc, dfc_cat_cols], axis=1)
        self.cat_cols = dfc_cat_cols.columns.values.tolist()

    if self.cont_cols is None:
        self.init_cont_cols = self._infere_cont_cols(dfc, self.cat_cols)
        dfc_init_cont_cols = drop_high_nan_cols(dfc[self.init_cont_cols])
        dfc = dfc.drop(columns=self.init_cont_cols)
        dfc = pd.concat([dfc, dfc_init_cont_cols], axis=1)
        self.init_cont_cols = dfc_init_cont_cols.columns.values.tolist()

        dfc = nan_with_number_imputer(dfc, self.init_cont_cols, -9999.0)

        (
            dfc,
            self.final_cont_cols,
        ) = self._drop_highly_corr_cols_and_get_final_cont_cols(
            dfc, self.init_cont_cols
        )

    else:
        self.final_cont_cols = list(
            set(self.cont_cols).intersection(set(dfc.columns.values))
        )

        dfc = nan_with_number_imputer(dfc, self.final_cont_cols, -9999.0)

    dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

    dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]

    self.label_encoder = LabelEncoder(self.cat_cols)
    dfc_le = self.label_encoder.fit_transform(dfc)

    dfc_le = self._change_int_float_types(dfc_le)

    dfc_le[self.id_cols] = df[self.id_cols]

    self.is_fitted = True

    return dfc_le

transform(df)

Transform dataset in inference step.

Source code in inference_model/preprocessing/preprocess.py
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Transform dataset in inference step."""

    if not self.is_fitted:
        raise NotFittedError(
            """This instance of 'PreprocessData' has not been fitted yet.
            Please, run 'fit' first"""
        )

    dfc = df.drop(columns=self.id_cols).copy()

    # added as mlflow inference is receiving all the data as objects
    dfc[self.cat_cols] = dfc[self.cat_cols].astype(str)
    dfc[self.final_cont_cols] = dfc[self.final_cont_cols].astype(float)

    try:
        dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]
    except KeyError:
        dfc = dfc[self.cat_cols + self.final_cont_cols]

    # dfc = dfc.replace(self.replace_rare_categories_dict)

    dfc = nan_with_unknown_imputer(dfc, self.cat_cols)

    dfc = nan_with_number_imputer(
        dfc,
        list(set(self.final_cont_cols)),
        -9999.0,  # noqa
    )

    dfc_le = self.label_encoder.transform(dfc)

    dfc_le = self._change_int_float_types(dfc_le)

    dfc_le[self.id_cols] = df[self.id_cols]

    return dfc_le

scaler_mapper(cont_cols, cat_cols, id_cols, scaler_mapper_def=None)

Function that maps scaler functions to appropriate columns.

By default, no scaler is assigned to continuous, categorical, or identifier columns; scalers must be set in scaler_mapper_def using sklearn scaler classes. Only columns defined in the mapper object will be present in the transformed dataset.

Parameters:

cont_cols (list, required): list of continuous feature columns in the dataset
cat_cols (list, required): list of categorical feature columns in the dataset
id_cols (list, required): identifier columns
scaler_mapper_def (dict, default = None): optional dictionary that contains keys ['cont_cols', 'cat_cols', 'id_cols'] with their corresponding scalers (defined by names, not instantiated) from the sklearn library
Source code in inference_model/preprocessing/scaler.py
def scaler_mapper(
    cont_cols: List[str],
    cat_cols: List[str],
    id_cols: List[str],
    scaler_mapper_def: Optional[dict] = None,
) -> DataFrameMapper:
    """Function that maps scaler functions to appropriate columns.

    By default does not assign any scaler to continuous, categorical or
    identifier columns. The scalers must be set in scaler_mapper_def. Use sklearn scalers.
    Only columns defined in mapper object will be present in the transformed dataset.

    Args:
        cont_cols (list): list of continuous feature columns in the dataset
        cat_cols (list): list of categorical feature columns in the dataset
        id_cols (list): identifier columns
        scaler_mapper_def (dict): optional dictionary that contains keys
            ['cont_cols', 'cat_cols', 'id_cols'] with their corresponding
            scalers (defined by names, not instantiated) from sklearn library
    Returns:
        scaler (DataFrameMapper): scaler object mapping sklearn scalers to columns in
            pandas dataframe
    """
    if scaler_mapper_def:
        cont_cols_def = gen_features(
            columns=list(map(lambda x: [x], cont_cols)),
            classes=[scaler_mapper_def["cont_cols"]],
        )

        cat_cols_def = gen_features(
            columns=list(map(lambda x: [x], cat_cols)),
            classes=[scaler_mapper_def["cat_cols"]],
        )

        id_cols_def = gen_features(
            columns=list(map(lambda x: [x], id_cols)),
            classes=[scaler_mapper_def["id_cols"]],
        )

    else:
        cont_cols_def = gen_features(
            columns=list(map(lambda x: [x], cont_cols)), classes=[None]
        )

        cat_cols_def = gen_features(
            columns=list(map(lambda x: [x], cat_cols)), classes=[None]
        )

        id_cols_def = gen_features(
            columns=list(map(lambda x: [x], id_cols)), classes=[None]
        )

    scaler = DataFrameMapper(cont_cols_def + cat_cols_def + id_cols_def, df_out=True)
    return scaler
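
A hedged sketch of passing sklearn scaler classes through scaler_mapper_def (the StandardScaler/None choices, column names, and input dataframe df are illustrative):

>>> from sklearn.preprocessing import StandardScaler
>>> mapper = scaler_mapper(
...     cont_cols=['price'],
...     cat_cols=['color'],
...     id_cols=['id'],
...     scaler_mapper_def={'cont_cols': StandardScaler, 'cat_cols': None, 'id_cols': None},
... )
>>> df_out = mapper.fit_transform(df)  # df must contain the mapped columns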