Training

aiqc(actual, high_quantile, low_quantile)

Average InterQuantile Coverage for Quantile Regression. Checks whether the interquantile coverage is close to the expected coverage on the test dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| high_quantile | ndarray | high quantile predicted values | required |
| low_quantile | ndarray | low quantile predicted values | required |

Source code in inference_model/training/metrics.py, lines 77-88
def aiqc(actual: np.ndarray, high_quantile: np.ndarray, low_quantile: np.ndarray):
    """Average InterQuantile Coverage for Quantile Regression.
    Check if the interquantile coverage is close to the coverage of in the test dataset.

    Args:
        actual (np.ndarray): actual values
        high_quantile_predicted (np.ndarray): high quantile predicted values
        low_quantile_predicted (np.ndarray): low quantile predicted values
    Returns:
        aiqc (float): average interquantile coverage
    """
    return np.sum((actual < high_quantile) & (actual > low_quantile)) / len(actual)
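
A minimal usage sketch (the toy arrays below are illustrative; the import path assumes the module layout shown above):

```python
import numpy as np

from inference_model.training.metrics import aiqc

actual = np.array([1.0, 2.0, 3.0, 4.0])
low_quantile = np.array([0.5, 1.5, 3.2, 3.0])   # e.g. predicted 5th percentile
high_quantile = np.array([1.5, 2.5, 3.8, 5.0])  # e.g. predicted 95th percentile

# 3 of the 4 actuals fall strictly inside (low, high) -> coverage 0.75
print(aiqc(actual, high_quantile=high_quantile, low_quantile=low_quantile))  # 0.75
```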

nacil(actual, high_quantile_predicted, low_quantile_predicted)

Normalized Average Confidence Interval Length for Quantile Regression. The interval length is normalized by the actual (expected) value.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| high_quantile_predicted | ndarray | high quantile predicted values | required |
| low_quantile_predicted | ndarray | low quantile predicted values | required |

Source code in inference_model/training/metrics.py, lines 59-74
def nacil(
    actual: np.ndarray,
    high_quantile_predicted: np.ndarray,
    low_quantile_predicted: np.ndarray,
):
    """Normalized Average Confidence Interval Length for Quantile Regression.
    The interval length is normalized by the actual (expected) value.

    Args:
        actual (np.ndarray): actual values
        high_quantile_predicted (np.ndarray): high quantile predicted values
        low_quantile_predicted (np.ndarray): low quantile predicted values
    Returns:
        nacil (float): normalized average confidence interval length
    """
    return np.mean((high_quantile_predicted - low_quantile_predicted) / actual)
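
A quick example with made-up values:

```python
import numpy as np

from inference_model.training.metrics import nacil

actual = np.array([10.0, 20.0])
low_quantile_predicted = np.array([8.0, 15.0])
high_quantile_predicted = np.array([12.0, 25.0])

# interval lengths are 4 and 10; normalized by actual: 0.4 and 0.5 -> mean 0.45
print(nacil(actual, high_quantile_predicted, low_quantile_predicted))  # 0.45
```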

ndcg(actual, predicted)

Normalized Discounted Cumulative Gain

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| predicted | ndarray | predicted values | required |

Source code in inference_model/training/metrics.py, lines 32-44
def ndcg(actual: np.ndarray, predicted: Union[np.ndarray, list]) -> float:
    """Normalized Discounted Cumulative Gain

    Args:
        actual (np.ndarray): actual values
        predicted (np.ndarray): predicted values
    Returns:
        ndcg (float): normalized discounted cumulative gain
    """
    return ndcg_score(
        [rankdata(actual, method="ordinal")],
        [rankdata(predicted, method="ordinal")],
    )
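
A quick example; both arguments are reduced to ordinal ranks before scoring, so a prediction with the same ordering as the actuals scores 1.0:

```python
import numpy as np

from inference_model.training.metrics import ndcg

actual = np.array([3.0, 1.0, 2.0])
perfect = [3.0, 1.0, 2.0]      # same ordering as actual -> NDCG == 1.0
shuffled = [1.0, 3.0, 2.0]     # different ordering -> NDCG < 1.0

print(ndcg(actual, perfect))   # 1.0
print(ndcg(actual, shuffled))  # < 1.0
```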

rmse(actual, predicted)

Root Mean Squared Error

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| predicted | ndarray | predicted values | required |

Source code in inference_model/training/metrics.py, lines 47-56
def rmse(actual: np.ndarray, predicted: Union[np.ndarray, list]) -> float:
    """Root Mean Squared Error

    Args:
        actual (np.ndarray): actual values
        predicted (np.ndarray): predicted values
    Returns:
        rmse (float): root mean square error
    """
    return mean_squared_error(actual, predicted, squared=False)
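
A quick example with toy values. Note that recent scikit-learn releases replace the `squared=False` argument with a separate `root_mean_squared_error` function, so the wrapper above assumes a version that still accepts it:

```python
import numpy as np

from inference_model.training.metrics import rmse

actual = np.array([1.0, 2.0, 3.0])
predicted = np.array([1.0, 2.0, 5.0])

# squared errors: 0, 0, 4 -> MSE = 4/3 -> RMSE ~= 1.155
print(rmse(actual, predicted))
```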

smape(actual, predicted)

Symmetric Mean Absolute Percentage Error https://vedexcel.com/how-to-calculate-smape-in-python/

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| predicted | ndarray | predicted values | required |

Source code in inference_model/training/metrics.py, lines 15-29
def smape(actual: np.ndarray, predicted: Union[np.ndarray, list]) -> float:
    """Symmetric Mean Absolute Percentage Error
    https://vedexcel.com/how-to-calculate-smape-in-python/

    Args:
        actual (np.ndarray): actual values
        predicted (np.ndarray): predicted values
    Returns:
        smape (float): symmetric mean absolute percentage error
    """
    return (
        100
        / len(actual)
        * np.sum(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))
    )
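
A quick example with toy values:

```python
import numpy as np

from inference_model.training.metrics import smape

actual = np.array([100.0, 200.0])
predicted = np.array([110.0, 180.0])

# per-point terms: 2*10/210 and 2*20/380 -> SMAPE = 100/2 * (0.0952 + 0.1053) ~= 10.03
print(smape(actual, predicted))
```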

LGBOptunaOptimizer

Bases: BaseOptimizer

Source code in inference_model/training/optuna_optimizer.py, lines 16-65
class LGBOptunaOptimizer(BaseOptimizer):
    def __init__(
        self,
        objective: Literal["binary", "multiclass", "regression"],
        n_class: Optional[int] = None,
    ):
        """Fallback/backup Optuna optimizer. Development is focused on Raytune.
        Keeping this code as backup.

        Args:
            objective (str): objective of the model
            n_class (int): number of classes in the dataset
        """
        # Optuna does not support the original lightgbm implementation; with a workaround
        # it is possible to get the binary classifier with focal_loss (or any custom loss)
        # running, but multiclass requires changes to Optuna code, e.g. this post:
        # https://lightrun.com/answers/optuna-optuna-error-when-using-custom-metrics-in-optunaintegrationlightgbm  # noqa
        # loss = None
        super(LGBOptunaOptimizer, self).__init__(objective, n_class)
        self.params = self.base_params

    def optimize(self, dtrain: lgbDataset, deval: lgbDataset):
        """Optimize LGBM model on provided datasets.

        Args:
            dtrain (lgbDataset): training lgb dataset
            deval (lgbDataset): evaluation lgb dataset
        """
        dtrain_copy = deepcopy(dtrain)
        deval_copy = deepcopy(deval) if deval is not None else None
        study = create_study(study_name="LightGBMTuner")

        tuner = LightGBMTuner(
            params=self.params,
            train_set=dtrain_copy,
            valid_sets=deval_copy,
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(stopping_rounds=50)],
            feval=self.feval,
            study=study,
            # fobj=self.fobj,
        )
        tuner.run()

        self.study = study
        self.best = tuner.best_params
        # since n_estimators is not among the params that Optuna optimizes we
        # need to add it manually. We add a high value since it will be used
        # with early_stopping_rounds
        self.best["n_estimators"] = 1000  # type: ignore

__init__(objective, n_class=None)

Fallback/backup Optuna optimizer. Development is focused on Raytune. Keeping this code as backup.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| objective | str | objective of the model | required |
| n_class | int | number of classes in the dataset | None |

Source code in inference_model/training/optuna_optimizer.py, lines 17-35
def __init__(
    self,
    objective: Literal["binary", "multiclass", "regression"],
    n_class: Optional[int] = None,
):
    """Fallback/backup Optuna optimizer. Development is focused on Raytune.
    Keeping this code as backup.

    Args:
        objective (str): objective of the model
        n_class (int): number of classes in the dataset
    """
    # Optuna does not support the original lightgbm implementation; with a workaround
    # it is possible to get the binary classifier with focal_loss (or any custom loss)
    # running, but multiclass requires changes to Optuna code, e.g. this post:
    # https://lightrun.com/answers/optuna-optuna-error-when-using-custom-metrics-in-optunaintegrationlightgbm  # noqa
    # loss = None
    super(LGBOptunaOptimizer, self).__init__(objective, n_class)
    self.params = self.base_params

optimize(dtrain, deval)

Optimize LGBM model on provided datasets.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dtrain | Dataset | training lgb dataset | required |
| deval | Dataset | evaluation lgb dataset | required |

Source code in inference_model/training/optuna_optimizer.py, lines 37-65
def optimize(self, dtrain: lgbDataset, deval: lgbDataset):
    """Optimize LGBM model on provided datasets.

    Args:
        dtrain (lgbDataset): training lgb dataset
        deval (lgbDataset): evaluation lgb dataset
    """
    dtrain_copy = deepcopy(dtrain)
    deval_copy = deepcopy(deval) if deval is not None else None
    study = create_study(study_name="LightGBMTuner")

    tuner = LightGBMTuner(
        params=self.params,
        train_set=dtrain_copy,
        valid_sets=deval_copy,
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
        feval=self.feval,
        study=study,
        # fobj=self.fobj,
    )
    tuner.run()

    self.study = study
    self.best = tuner.best_params
    # since n_estimators is not among the params that Optuna optimizes we
    # need to add it manually. We add a high value since it will be used
    # with early_stopping_rounds
    self.best["n_estimators"] = 1000  # type: ignore

Trainer

Bases: BaseTrainer

Source code in inference_model/training/trainer.py, lines 17-170
class Trainer(BaseTrainer):
    def __init__(
        self,
        cat_cols: List[str],
        target_col: str,
        id_cols: List[str],
        objective: Literal["binary"],
        optimizer: Union[Any, BaseOptimizer, None] = None,
        n_class: Optional[int] = None,
        preprocessors: Optional[List[Union[Any, PreprocessData]]] = None,
    ):
        """Objects that governs training and parameter optimization of the lgbm model.

        Args:
            cat_cols (list): list of categorical feature column names
            target_col (str): column name that represents target
            id_cols (list): identification column names
            objective (str): type of the task/objective
            optimizer (BaseOptimizer): parameter optimizer object
            n_class (int): number of classes in the dataset
            preprocessors (List[Union[Any, PreprocessData]]):
                ordered list of objects to preprocess dataset before optimization
                and training
        """
        super(Trainer, self).__init__(
            cat_cols=cat_cols,
            target_col=target_col,
            id_cols=id_cols,
            objective=objective,
            n_class=n_class,
            preprocessors=preprocessors,
        )
        if optimizer is not None:
            if not hasattr(optimizer, "optimize"):
                raise AttributeError(
                    "{} optimizer must have {} method".format(optimizer, "optimize")
                )
        self.optimizer = optimizer

    def train(
        self,
        df_train: pd.DataFrame,
        params: Optional[Dict] = None,
        df_valid: Optional[pd.DataFrame] = None,
    ):
        """Train the model with the parameters.

        Args:
            df_train (pd.DataFrame): training dataset
            params (dict): model parameters
            df_valid (pd.DataFrame): optional validation dataset
        Returns:
            None: the trained model is stored as self.model (lgb.basic.Booster)
        """
        if self.preprocessors:
            for prep in self.preprocessors:
                df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
                df_valid_prep = (
                    prep.transform(df_valid.drop(columns=[self.target_col]))
                    if df_valid is not None
                    else None
                )
            if self.objective in ["binary"]:
                df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
                if df_valid is not None:
                    df_valid_prep[self.target_col] = df_valid[self.target_col].astype(
                        int
                    )
            else:
                raise NotImplementedError
        else:
            df_train_prep = df_train.copy()
            df_valid_prep = df_valid.copy() if df_valid is not None else None

        cat_cols = self.cat_cols
        if params:
            config = params
        elif (params is None) and (self.optimizer):
            config = self.optimizer.best
            if hasattr(self.optimizer, "best_to_drop"):
                df_train_prep.drop(
                    columns=self.optimizer.best_to_drop,  # type: ignore
                    inplace=True,
                )
                if df_valid is not None:
                    df_valid_prep.drop(
                        columns=self.optimizer.best_to_drop,  # type: ignore
                        inplace=True,
                    )
                cat_cols = intsec(self.cat_cols, df_train_prep.columns.values)

        elif (params is None) and (self.optimizer is None):
            config = self.base_params

        lgb_train, lgb_valid = to_lgbdataset(
            train=df_train_prep,
            cat_cols=cat_cols,
            target_col=self.target_col,
            id_cols=self.id_cols,
            valid=df_valid_prep,
        )

        self.model = lgb_train_function(
            config=config,
            lgbtrain=lgb_train,
            lgbeval=lgb_valid,
            feval=self.feval,
        )

    def fit(
        self,
        df_train: pd.DataFrame,
        df_valid: pd.DataFrame,
        df_test: pd.DataFrame,
        random_state: int = 1,
    ) -> pd.DataFrame:
        """Train the model and optimize the parameters.

        Args:
            df_train (pd.DataFrame): training dataset
            df_valid (pd.DataFrame): validation dataset
            df_test (pd.DataFrame): testing dataset
        Returns:
            metrics_dict (pd.DataFrame): metrics computed on the test dataset
        """
        if self.preprocessors:
            # NOTE: with multiple preprocessors only the last transform is kept;
            # each iteration should instead use
            # df_train_prep = prep.transform(df_train_prep.drop(columns=[self.target_col]))
            for prep in self.preprocessors:
                df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
                df_valid_prep = prep.transform(df_valid.drop(columns=[self.target_col]))
            if self.objective in ["binary"]:
                df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
                df_valid_prep[self.target_col] = df_valid[self.target_col].astype(int)
            else:
                raise NotImplementedError
        else:
            df_train_prep = df_train.copy()
            df_valid_prep = df_valid.copy()

        lgb_train, lgb_valid = to_lgbdataset(
            train=df_train_prep,
            cat_cols=self.cat_cols,
            target_col=self.target_col,
            id_cols=self.id_cols,
            valid=df_valid_prep,
        )
        self.optimizer.optimize(dtrain=lgb_train, deval=lgb_valid)

        df_train_valid = pd.concat([df_train, df_valid], ignore_index=True)
        df_train_valid = df_train_valid.reset_index(drop=True)
        self.train(df_train=df_train_valid)
        metrics_dict = self.compute_metrics(df_test)
        return metrics_dict
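
A usage sketch of the full optimize-then-refit flow, assuming `df_train`, `df_valid` and `df_test` are pandas DataFrames sharing the same columns (column names below are illustrative):

```python
from inference_model.training.optuna_optimizer import LGBOptunaOptimizer
from inference_model.training.trainer import Trainer

trainer = Trainer(
    cat_cols=["category"],
    target_col="target",
    id_cols=["id"],
    objective="binary",
    optimizer=LGBOptunaOptimizer(objective="binary"),
)

# optimizes hyperparameters on train/valid, refits on train+valid,
# then evaluates on the held-out test set
metrics = trainer.fit(df_train=df_train, df_valid=df_valid, df_test=df_test)

print(metrics)
print(trainer.model)  # the trained lgb.basic.Booster
```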

__init__(cat_cols, target_col, id_cols, objective, optimizer=None, n_class=None, preprocessors=None)

Object that governs training and parameter optimization of the lgbm model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| cat_cols | list | list of categorical feature column names | required |
| target_col | str | column name that represents target | required |
| id_cols | list | identification column names | required |
| objective | str | type of the task/objective | required |
| optimizer | BaseOptimizer | parameter optimizer object | None |
| n_class | int | number of classes in the dataset | None |
| preprocessors | List[Union[Any, PreprocessData]] | ordered list of objects to preprocess dataset before optimization and training | None |

Source code in inference_model/training/trainer.py, lines 18-54
def __init__(
    self,
    cat_cols: List[str],
    target_col: str,
    id_cols: List[str],
    objective: Literal["binary"],
    optimizer: Union[Any, BaseOptimizer, None] = None,
    n_class: Optional[int] = None,
    preprocessors: Optional[List[Union[Any, PreprocessData]]] = None,
):
    """Objects that governs training and parameter optimization of the lgbm model.

    Args:
        cat_cols (list): list of categorical feature column names
        target_col (str): column name that represents target
        id_cols (list): identification column names
        objective (str): type of the task/objective
        optimizer (BaseOptimizer): parameter optimizer object
        n_class (int): number of classes in the dataset
        preprocessors (List[Union[Any, PreprocessData]]):
            ordered list of objects to preprocess dataset before optimization
            and training
    """
    super(Trainer, self).__init__(
        cat_cols=cat_cols,
        target_col=target_col,
        id_cols=id_cols,
        objective=objective,
        n_class=n_class,
        preprocessors=preprocessors,
    )
    if optimizer is not None:
        if not hasattr(optimizer, "optimize"):
            raise AttributeError(
                "{} optimizer must have {} method".format(optimizer, "optimize")
            )
    self.optimizer = optimizer

fit(df_train, df_valid, df_test, random_state=1)

Train the model and optimize the parameters.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_train | DataFrame | training dataset | required |
| df_valid | DataFrame | validation dataset | required |
| df_test | DataFrame | testing dataset | required |

Source code in inference_model/training/trainer.py, lines 126-170
def fit(
    self,
    df_train: pd.DataFrame,
    df_valid: pd.DataFrame,
    df_test: pd.DataFrame,
    random_state: int = 1,
) -> pd.DataFrame:
    """Train the model and optimize the parameters.

    Args:
        df_train (pd.DataFrame): training dataset
        df_valid (pd.DataFrame): validation dataset
        df_test (pd.DataFrame): testing dataset
    Returns:
        metrics_dict (pd.DataFrame): metrics computed on the test dataset
    """
    if self.preprocessors:
        # NOTE: with multiple preprocessors only the last transform is kept;
        # each iteration should instead use
        # df_train_prep = prep.transform(df_train_prep.drop(columns=[self.target_col]))
        for prep in self.preprocessors:
            df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
            df_valid_prep = prep.transform(df_valid.drop(columns=[self.target_col]))
        if self.objective in ["binary"]:
            df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
            df_valid_prep[self.target_col] = df_valid[self.target_col].astype(int)
        else:
            raise NotImplementedError
    else:
        df_train_prep = df_train.copy()
        df_valid_prep = df_valid.copy()

    lgb_train, lgb_valid = to_lgbdataset(
        train=df_train_prep,
        cat_cols=self.cat_cols,
        target_col=self.target_col,
        id_cols=self.id_cols,
        valid=df_valid_prep,
    )
    self.optimizer.optimize(dtrain=lgb_train, deval=lgb_valid)

    df_train_valid = pd.concat([df_train, df_valid], ignore_index=True)
    df_train_valid = df_train_valid.reset_index(drop=True)
    self.train(df_train=df_train_valid)
    metrics_dict = self.compute_metrics(df_test)
    return metrics_dict

train(df_train, params=None, df_valid=None)

Train the model with the parameters.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_train | DataFrame | training dataset | required |
| params | dict | model parameters | None |
| df_valid | DataFrame | optional validation dataset | None |

Source code in inference_model/training/trainer.py, lines 56-124
def train(
    self,
    df_train: pd.DataFrame,
    params: Optional[Dict] = None,
    df_valid: Optional[pd.DataFrame] = None,
):
    """Train the model with the parameters.

    Args:
        df_train (pd.DataFrame): training dataset
        params (dict): model parameters
        df_valid (pd.DataFrame): optional validation dataset
    Returns:
        None: the trained model is stored as self.model (lgb.basic.Booster)
    """
    if self.preprocessors:
        for prep in self.preprocessors:
            df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
            df_valid_prep = (
                prep.transform(df_valid.drop(columns=[self.target_col]))
                if df_valid is not None
                else None
            )
        if self.objective in ["binary"]:
            df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
            if df_valid is not None:
                df_valid_prep[self.target_col] = df_valid[self.target_col].astype(
                    int
                )
        else:
            raise NotImplementedError
    else:
        df_train_prep = df_train.copy()
        df_valid_prep = df_valid.copy() if df_valid is not None else None

    cat_cols = self.cat_cols
    if params:
        config = params
    elif (params is None) and (self.optimizer):
        config = self.optimizer.best
        if hasattr(self.optimizer, "best_to_drop"):
            df_train_prep.drop(
                columns=self.optimizer.best_to_drop,  # type: ignore
                inplace=True,
            )
            if df_valid is not None:
                df_valid_prep.drop(
                    columns=self.optimizer.best_to_drop,  # type: ignore
                    inplace=True,
                )
            cat_cols = intsec(self.cat_cols, df_train_prep.columns.values)

    elif (params is None) and (self.optimizer is None):
        config = self.base_params

    lgb_train, lgb_valid = to_lgbdataset(
        train=df_train_prep,
        cat_cols=cat_cols,
        target_col=self.target_col,
        id_cols=self.id_cols,
        valid=df_valid_prep,
    )

    self.model = lgb_train_function(
        config=config,
        lgbtrain=lgb_train,
        lgbeval=lgb_valid,
        feval=self.feval,
    )
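
For comparison, a sketch of calling `train` directly with a fixed parameter set instead of running hyperparameter optimization (parameter values are illustrative; `df_train` and `df_valid` are assumed pandas DataFrames containing the target and id columns):

```python
from inference_model.training.trainer import Trainer

trainer = Trainer(
    cat_cols=["category"],
    target_col="target",
    id_cols=["id"],
    objective="binary",
)

trainer.train(
    df_train=df_train,
    params={"objective": "binary", "learning_rate": 0.05, "num_leaves": 31},
    df_valid=df_valid,
)

# the fitted booster is stored on the trainer
booster = trainer.model
```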

flatten_dict(d, parent_key='', sep='_')

Flatten a nested dictionary. Fastest approach according to https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/

Source code in inference_model/training/utils.py, lines 184-197
def flatten_dict(
    d: MutableMapping, parent_key: str = "", sep: str = "_"
) -> MutableMapping:
    """
    Flatten a nested dictionary. Fastest approach according to https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/
    """
    items: List[Any] = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
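
For example:

```python
from inference_model.training.utils import flatten_dict

params = {"model": {"learning_rate": 0.05, "lgbm": {"num_leaves": 31}}, "seed": 1}

print(flatten_dict(params))
# {'model_learning_rate': 0.05, 'model_lgbm_num_leaves': 31, 'seed': 1}
```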

get_feature_importance(model)

Extract model feature importances and return sorted dataframe.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | Booster | LightGBM model | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| feature_imp | DataFrame | sorted dataframe with features and their importances |

Source code in inference_model/training/utils.py, lines 166-181
def get_feature_importance(model: lgb.basic.Booster) -> pd.DataFrame:
    """Extract model feature importances and return sorted dataframe.

    Args:
        model (lgb.basic.Booster): LightGBM model

    Returns:
        feature_imp (pd.DataFrame): sorted dataframe with features and their
            importances
    """
    feature_imp = pd.DataFrame(
        {"value": model.feature_importance(), "feature": model.feature_name()}
    )
    feature_imp.sort_values(by="value", inplace=True)
    feature_imp.reset_index(drop=True, inplace=True)
    return feature_imp
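
For example, assuming `model` is a trained lgb.basic.Booster (e.g. `trainer.model` from the Trainer above):

```python
from inference_model.training.utils import get_feature_importance

feature_imp = get_feature_importance(model)
print(feature_imp.tail(10))  # the ten most important features (sorted ascending)
```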

get_or_create_experiment(experiment_name)

Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

This function checks if an experiment with the given name exists within MLflow. If it does, the function returns its ID. If not, it creates a new experiment with the provided name and returns its ID.

Parameters: - experiment_name (str): Name of the MLflow experiment.

Returns: - str: ID of the existing or newly created MLflow experiment.

Source code in inference_model/training/utils.py, lines 11-29
def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)
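
A usage sketch (the experiment name and logged parameter are illustrative):

```python
import mlflow

from inference_model.training.utils import get_or_create_experiment

experiment_id = get_or_create_experiment("inference-model-training")

# log a run under the existing or newly created experiment
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_param("learning_rate", 0.05)
```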

predict_cls_lgbm_from_raw(preds_raw, task)

Helper function to convert raw margin predictions into predicted class labels (via a sigmoid for binary and a softmax for multiclass tasks).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| preds_raw | ndarray | raw margin predictions | required |
| task | str | type of task/objective, "binary" or "multiclass" | required |

Source code in inference_model/training/utils.py, lines 142-163
def predict_cls_lgbm_from_raw(
    preds_raw: np.ndarray, task: Literal["binary", "multiclass"]
) -> np.ndarray:
    """Helper function to convert raw margin predictions through a
    sigmoid to represent a probability.

    Args:
        preds_raw (ndarray):
            predictions
        lgbDataset (lightgbm.Dataset):
            dataset, containing labels, used for prediction
    Returns:
        (y_true, preds):
            tuple containg labels and predictions for further evaluation
    """
    predicted_probs = predict_proba_lgbm_from_raw(preds_raw=preds_raw, task=task)
    if task == "binary":
        pred_cls = np.array([int(p > 0.5) for p in predicted_probs])
    elif task == "multiclass":
        pred_cls = predicted_probs.argmax(axis=1)

    return pred_cls
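
A sketch of converting raw margins into class labels, assuming `booster` is a trained binary lgb.basic.Booster and `X` a feature DataFrame:

```python
from inference_model.training.utils import predict_cls_lgbm_from_raw

preds_raw = booster.predict(X, raw_score=True)  # raw margins, not probabilities
pred_cls = predict_cls_lgbm_from_raw(preds_raw, task="binary")  # 0/1 labels at a 0.5 threshold
```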

predict_proba_lgbm_from_raw(preds_raw, task, binary2d=False)

Apply a sigmoid (binary) or softmax (multiclass) to the raw output of lightgbm.predict(). This replaces predict_proba().

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| preds_raw | ndarray | 1D numpy array of arrays | required |
| task | str | type of task/objective | required |
| binary2d | boolean | whether the output of binary classification should be a 1d or 2d vector | False |

Source code in inference_model/training/utils.py, lines 113-139
def predict_proba_lgbm_from_raw(
    preds_raw: np.ndarray,
    task: Literal["binary", "multiclass"],
    binary2d: Optional[bool] = False,
) -> np.ndarray:
    """Apply softmax to array of arrays that is an output of lightgbm.predct().
    This replaces predict_proba().

    Args:
        predicted_raw (ndarray):
            1D numpy array of arrays
        task (str):
            type of task/objective
        binary2d (boolean):
            wether the output of binary classification should be 1 or 2d vector
    Returns:
        predicted_probs (ndarray): array with predicted probabilities
    """
    if task == "binary":
        predicted_probs = _sigmoid(preds_raw)
        if binary2d:
            predicted_probs = np.apply_along_axis(
                _binary_margin_to_prob, 1, np.vstack(tuple(predicted_probs))
            )
    elif task == "multiclass":
        predicted_probs = np.apply_along_axis(_softmax, 1, np.stack(tuple(preds_raw)))
    return predicted_probs
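
A sketch using made-up raw margins for a binary model; `binary2d=True` is expected to return one row of class probabilities per sample rather than a flat vector, as the docstring suggests:

```python
import numpy as np

from inference_model.training.utils import predict_proba_lgbm_from_raw

# raw margins as returned by booster.predict(X, raw_score=True)
preds_raw = np.array([-2.0, 0.0, 1.5])

probs = predict_proba_lgbm_from_raw(preds_raw, task="binary")
print(probs)  # sigmoid of the margins: approx. [0.12, 0.5, 0.82]

probs_2d = predict_proba_lgbm_from_raw(preds_raw, task="binary", binary2d=True)
print(probs_2d.shape)  # expected (3, 2): one row of class probabilities per sample
```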

to_lgbdataset(train, cat_cols, target_col, id_cols=[], valid=None)

Transform pandas dataframes into a LightGBM training dataset and, optionally, an evaluation dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| train | DataFrame | training dataset | required |
| cat_cols | list | list of categorical columns | required |
| target_col | str | target column in the dataset | required |
| id_cols | list | list of identifier columns | [] |
| valid | DataFrame | validation dataset | None |

Source code in inference_model/training/utils.py, lines 32-76
def to_lgbdataset(
    train: pd.DataFrame,
    cat_cols: List[str],
    target_col: str,
    id_cols: List[str] = [],
    valid: Optional[pd.DataFrame] = None,
) -> Tuple[lgbDataset, Optional[lgbDataset]]:
    """Transform pandas dataframe to lgbm dataset, or datasets(eval).

    Args:
        train (pd.DataFrame):
            training dataset
        cat_cols (list):
            list of categorical columns
        target_col (str):
            target column in the dataset
        id_cols (list):
            list of identifier columns
        valid (pd.DataFrame):
            validation dataset
    Returns:
        lgb_train (lgbDataset):
            lgbm training dataset
        lgb_valid (lgbDataset):
            lgbm valid dataset
    """

    lgb_train = lgbDataset(
        train.drop(columns=[target_col] + id_cols),
        train[target_col],
        categorical_feature=cat_cols,
        free_raw_data=False,
    )

    if valid is not None:
        lgb_valid = lgbDataset(
            valid.drop(columns=[target_col] + id_cols),
            valid[target_col],
            reference=lgb_train,
            free_raw_data=False,
        )
    else:
        lgb_valid = None

    return lgb_train, lgb_valid
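
A minimal sketch with a toy frame (column names are illustrative):

```python
import pandas as pd

from inference_model.training.utils import to_lgbdataset

df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "category": pd.Categorical(["a", "b", "a", "b"]),
        "num_feature": [0.1, 0.4, 0.3, 0.9],
        "target": [0, 1, 0, 1],
    }
)

lgb_train, lgb_valid = to_lgbdataset(
    train=df,
    cat_cols=["category"],
    target_col="target",
    id_cols=["id"],
)
print(lgb_valid)  # None, since no validation frame was passed
```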