Training

aiqc(actual, high_quantile, low_quantile)

Average InterQuantile Coverage for Quantile Regression. Checks whether the interquantile coverage is close to the expected coverage on the test dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| high_quantile | ndarray | high quantile predicted values | required |
| low_quantile | ndarray | low quantile predicted values | required |

Source code in inference_model/training/metrics.py, lines 77-88
def aiqc(actual: np.ndarray, high_quantile: np.ndarray, low_quantile: np.ndarray):
    """Average InterQuantile Coverage for Quantile Regression.
    Check if the interquantile coverage is close to the coverage of in the test dataset.

    Args:
        actual (np.ndarray): actual values
        high_quantile_predicted (np.ndarray): high quantile predicted values
        low_quantile_predicted (np.ndarray): low quantile predicted values
    Returns:
        aiqc (float): average interquantile coverage
    """
    return np.sum((actual < high_quantile) & (actual > low_quantile)) / len(actual)
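
A minimal usage sketch (the toy arrays below are illustrative; the import path assumes the module layout shown above):

```python
import numpy as np

from inference_model.training.metrics import aiqc

actual = np.array([1.0, 2.0, 3.0, 4.0])
low_quantile = np.array([0.5, 1.5, 3.2, 3.0])   # e.g. predicted 5th percentile
high_quantile = np.array([1.5, 2.5, 3.8, 5.0])  # e.g. predicted 95th percentile

# 3 of the 4 actuals fall strictly inside (low, high) -> coverage 0.75
print(aiqc(actual, high_quantile=high_quantile, low_quantile=low_quantile))  # 0.75
```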

nacil(actual, high_quantile_predicted, low_quantile_predicted)

Normalized Average Confidence Interval Length for Quantile Regression. The interval length is normalized by the actual (expected) value.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| high_quantile_predicted | ndarray | high quantile predicted values | required |
| low_quantile_predicted | ndarray | low quantile predicted values | required |

Source code in inference_model/training/metrics.py, lines 59-74
def nacil(
    actual: np.ndarray,
    high_quantile_predicted: np.ndarray,
    low_quantile_predicted: np.ndarray,
):
    """Normalized Average Confidence Interval Length for Quantile Regression.
    The interval length is normalized by the actual (expected) value.

    Args:
        actual (np.ndarray): actual values
        high_quantile_predicted (np.ndarray): high quantile predicted values
        low_quantile_predicted (np.ndarray): low quantile predicted values
    Returns:
        nacil (float): normalized average confidence interval length
    """
    return np.mean((high_quantile_predicted - low_quantile_predicted) / actual)
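
A quick example with made-up values:

```python
import numpy as np

from inference_model.training.metrics import nacil

actual = np.array([10.0, 20.0])
low_quantile_predicted = np.array([8.0, 15.0])
high_quantile_predicted = np.array([12.0, 25.0])

# interval lengths are 4 and 10; normalized by actual: 0.4 and 0.5 -> mean 0.45
print(nacil(actual, high_quantile_predicted, low_quantile_predicted))  # 0.45
```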

ndcg(actual, predicted)

Normalized Discounted Cumulative Gain

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| predicted | ndarray | predicted values | required |

Source code in inference_model/training/metrics.py, lines 32-44
def ndcg(actual: np.ndarray, predicted: Union[np.ndarray, list]) -> float:
    """Normalized Discounted Cumulative Gain

    Args:
        actual (np.ndarray): actual values
        predicted (np.ndarray): predicted values
    Returns:
        ndcg (float): normalized discounted cumulative gain
    """
    return ndcg_score(
        [rankdata(actual, method="ordinal")],
        [rankdata(predicted, method="ordinal")],
    )
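
A quick example; both arguments are reduced to ordinal ranks before scoring, so a prediction with the same ordering as the actuals scores 1.0:

```python
import numpy as np

from inference_model.training.metrics import ndcg

actual = np.array([3.0, 1.0, 2.0])
perfect = [3.0, 1.0, 2.0]      # same ordering as actual -> NDCG == 1.0
shuffled = [1.0, 3.0, 2.0]     # different ordering -> NDCG < 1.0

print(ndcg(actual, perfect))   # 1.0
print(ndcg(actual, shuffled))  # < 1.0
```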

rmse(actual, predicted)

Root Mean Squared Error

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| predicted | ndarray | predicted values | required |

Source code in inference_model/training/metrics.py, lines 47-56
def rmse(actual: np.ndarray, predicted: Union[np.ndarray, list]) -> float:
    """Root Mean Squared Error

    Args:
        actual (np.ndarray): actual values
        predicted (np.ndarray): predicted values
    Returns:
        rmse (float): root mean square error
    """
    return mean_squared_error(actual, predicted, squared=False)
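
A quick example with toy values. Note that recent scikit-learn releases replace the `squared=False` argument with a separate `root_mean_squared_error` function, so the wrapper above assumes a version that still accepts it:

```python
import numpy as np

from inference_model.training.metrics import rmse

actual = np.array([1.0, 2.0, 3.0])
predicted = np.array([1.0, 2.0, 5.0])

# squared errors: 0, 0, 4 -> MSE = 4/3 -> RMSE ~= 1.155
print(rmse(actual, predicted))
```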

smape(actual, predicted)

Symmetric Mean Absolute Percentage Error https://vedexcel.com/how-to-calculate-smape-in-python/

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| actual | ndarray | actual values | required |
| predicted | ndarray | predicted values | required |

Source code in inference_model/training/metrics.py, lines 15-29
def smape(actual: np.ndarray, predicted: Union[np.ndarray, list]) -> float:
    """Symmetric Mean Absolute Percentage Error
    https://vedexcel.com/how-to-calculate-smape-in-python/

    Args:
        actual (np.ndarray): actual values
        predicted (np.ndarray): predicted values
    Returns:
        smape (float): symmetric mean absolute percentage error
    """
    return (
        100
        / len(actual)
        * np.sum(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))
    )
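
A quick example with toy values:

```python
import numpy as np

from inference_model.training.metrics import smape

actual = np.array([100.0, 200.0])
predicted = np.array([110.0, 180.0])

# per-point terms: 2*10/210 and 2*20/380 -> SMAPE = 100/2 * (0.0952 + 0.1053) ~= 10.03
print(smape(actual, predicted))
```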

LGBOptunaOptimizer

Bases: BaseOptimizer

Source code in inference_model/training/optuna_optimizer.py, lines 16-65
class LGBOptunaOptimizer(BaseOptimizer):
    def __init__(
        self,
        objective: Literal["binary", "multiclass", "regression"],
        n_class: Optional[int] = None,
    ):
        """Fallback/backup Optuna optimizer. Development is focused on Raytune.
        Keeping this code as backup.

        Args:
            objective (str): objective of the model
            n_class (int): number of classes in the dataset
        """
        # Optuna does not support the original lightgbm implementation; with a workaround
        # it is possible to get the binary classifier with focal_loss (or any custom loss)
        # running, but multiclass requires changes to Optuna code, e.g. this post:
        # https://lightrun.com/answers/optuna-optuna-error-when-using-custom-metrics-in-optunaintegrationlightgbm  # noqa
        # loss = None
        super(LGBOptunaOptimizer, self).__init__(objective, n_class)
        self.params = self.base_params

    def optimize(self, dtrain: lgbDataset, deval: lgbDataset):
        """Optimize LGBM model on provided datasets.

        Args:
            dtrain (lgbDataset): training lgb dataset
            deval (lgbDataset): evaluation lgb dataset
        """
        dtrain_copy = deepcopy(dtrain)
        deval_copy = deepcopy(deval) if deval is not None else None
        study = create_study(study_name="LightGBMTuner")

        tuner = LightGBMTuner(
            params=self.params,
            train_set=dtrain_copy,
            valid_sets=deval_copy,
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(stopping_rounds=50)],
            feval=self.feval,
            study=study,
            # fobj=self.fobj,
        )
        tuner.run()

        self.study = study
        self.best = tuner.best_params
        # since n_estimators is not among the params that Optuna optimizes we
        # need to add it manually. We add a high value since it will be used
        # with early_stopping_rounds
        self.best["n_estimators"] = 1000  # type: ignore

__init__(objective, n_class=None)

Fallback/backup Optuna optimizer. Development is focused on Raytune. Keeping this code as backup.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| objective | str | objective of the model | required |
| n_class | int | number of classes in the dataset | None |

Source code in inference_model/training/optuna_optimizer.py, lines 17-35
def __init__(
    self,
    objective: Literal["binary", "multiclass", "regression"],
    n_class: Optional[int] = None,
):
    """Fallback/backup Optuna optimizer. Development is focused on Raytune.
    Keeping this code as backup.

    Args:
        objective (str): objective of the model
        n_class (int): number of classes in the dataset
    """
    # Optuna does not support the original lightgbm implementation; with a workaround
    # it is possible to get the binary classifier with focal_loss (or any custom loss)
    # running, but multiclass requires changes to Optuna code, e.g. this post:
    # https://lightrun.com/answers/optuna-optuna-error-when-using-custom-metrics-in-optunaintegrationlightgbm  # noqa
    # loss = None
    super(LGBOptunaOptimizer, self).__init__(objective, n_class)
    self.params = self.base_params

optimize(dtrain, deval)

Optimize LGBM model on provided datasets.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dtrain | Dataset | training lgb dataset | required |
| deval | Dataset | evaluation lgb dataset | required |

Source code in inference_model/training/optuna_optimizer.py, lines 37-65
def optimize(self, dtrain: lgbDataset, deval: lgbDataset):
    """Optimize LGBM model on provided datasets.

    Args:
        dtrain (lgbDataset): training lgb dataset
        deval (lgbDataset): evaluation lgb dataset
    """
    dtrain_copy = deepcopy(dtrain)
    deval_copy = deepcopy(deval) if deval is not None else None
    study = create_study(study_name="LightGBMTuner")

    tuner = LightGBMTuner(
        params=self.params,
        train_set=dtrain_copy,
        valid_sets=deval_copy,
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
        feval=self.feval,
        study=study,
        # fobj=self.fobj,
    )
    tuner.run()

    self.study = study
    self.best = tuner.best_params
    # since n_estimators is not among the params that Optuna optimizes we
    # need to add it manually. We add a high value since it will be used
    # with early_stopping_rounds
    self.best["n_estimators"] = 1000  # type: ignore

Trainer

Bases: BaseTrainer

Source code in inference_model/training/trainer.py, lines 17-170
class Trainer(BaseTrainer):
    def __init__(
        self,
        cat_cols: List[str],
        target_col: str,
        id_cols: List[str],
        objective: Literal["binary"],
        optimizer: Union[Any, BaseOptimizer, None] = None,
        n_class: Optional[int] = None,
        preprocessors: Optional[List[Union[Any, PreprocessData]]] = None,
    ):
        """Objects that governs training and parameter optimization of the lgbm model.

        Args:
            cat_cols (list): list of categorical feature column names
            target_col (str): column name that represents target
            id_cols (list): identification column names
            objective (str): type of the task/objective
            optimizer (BaseOptimizer): parameter optimizer object
            n_class (int): number of classes in the dataset
            preprocessors (List[Union[Any, PreprocessData]]):
                ordered list of objects to preprocess dataset before optimization
                and training
        """
        super(Trainer, self).__init__(
            cat_cols=cat_cols,
            target_col=target_col,
            id_cols=id_cols,
            objective=objective,
            n_class=n_class,
            preprocessors=preprocessors,
        )
        if optimizer is not None:
            if not hasattr(optimizer, "optimize"):
                raise AttributeError(
                    "{} optimizer must have {} method".format(optimizer, "optimize")
                )
        self.optimizer = optimizer

    def train(
        self,
        df_train: pd.DataFrame,
        params: Optional[Dict] = None,
        df_valid: Optional[pd.DataFrame] = None,
    ):
        """Train the model with the parameters.

        Args:
            df_train (pd.DataFrame): training dataset
            params (dict): model parameters
            df_valid (pd.DataFrame): optional validation dataset
        Returns:
            None: the trained model is stored as self.model (lgb.basic.Booster)
        """
        if self.preprocessors:
            for prep in self.preprocessors:
                df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
                df_valid_prep = (
                    prep.transform(df_valid.drop(columns=[self.target_col]))
                    if df_valid is not None
                    else None
                )
            if self.objective in ["binary"]:
                df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
                if df_valid is not None:
                    df_valid_prep[self.target_col] = df_valid[self.target_col].astype(
                        int
                    )
            else:
                raise NotImplementedError
        else:
            df_train_prep = df_train.copy()
            df_valid_prep = df_valid.copy() if df_valid is not None else None

        cat_cols = self.cat_cols
        if params:
            config = params
        elif (params is None) and (self.optimizer):
            config = self.optimizer.best
            if hasattr(self.optimizer, "best_to_drop"):
                df_train_prep.drop(
                    columns=self.optimizer.best_to_drop,  # type: ignore
                    inplace=True,
                )
                if df_valid is not None:
                    df_valid_prep.drop(
                        columns=self.optimizer.best_to_drop,  # type: ignore
                        inplace=True,
                    )
                cat_cols = intsec(self.cat_cols, df_train_prep.columns.values)

        elif (params is None) and (self.optimizer is None):
            config = self.base_params

        lgb_train, lgb_valid = to_lgbdataset(
            train=df_train_prep,
            cat_cols=cat_cols,
            target_col=self.target_col,
            id_cols=self.id_cols,
            valid=df_valid_prep,
        )

        self.model = lgb_train_function(
            config=config,
            lgbtrain=lgb_train,
            lgbeval=lgb_valid,
            feval=self.feval,
        )

    def fit(
        self,
        df_train: pd.DataFrame,
        df_valid: pd.DataFrame,
        df_test: pd.DataFrame,
        random_state: int = 1,
    ) -> pd.DataFrame:
        """Train the model and optimize the parameters.

        Args:
            df_train (pd.DataFrame): training dataset
            df_valid (pd.DataFrame): validation dataset
            df_test (pd.DataFrame): testing dataset
        Returns:
            metrics_dict (pd.DataFrame): metrics computed on the test dataset
        """
        if self.preprocessors:
            # NOTE: with multiple preprocessors only the last transform is kept;
            # each iteration should instead use
            # df_train_prep = prep.transform(df_train_prep.drop(columns=[self.target_col]))
            for prep in self.preprocessors:
                df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
                df_valid_prep = prep.transform(df_valid.drop(columns=[self.target_col]))
            if self.objective in ["binary"]:
                df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
                df_valid_prep[self.target_col] = df_valid[self.target_col].astype(int)
            else:
                raise NotImplementedError
        else:
            df_train_prep = df_train.copy()
            df_valid_prep = df_valid.copy()

        lgb_train, lgb_valid = to_lgbdataset(
            train=df_train_prep,
            cat_cols=self.cat_cols,
            target_col=self.target_col,
            id_cols=self.id_cols,
            valid=df_valid_prep,
        )
        self.optimizer.optimize(dtrain=lgb_train, deval=lgb_valid)

        df_train_valid = pd.concat([df_train, df_valid], ignore_index=True)
        df_train_valid = df_train_valid.reset_index(drop=True)
        self.train(df_train=df_train_valid)
        metrics_dict = self.compute_metrics(df_test)
        return metrics_dict
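
A usage sketch of the full optimize-then-refit flow, assuming `df_train`, `df_valid` and `df_test` are pandas DataFrames sharing the same columns (column names below are illustrative):

```python
from inference_model.training.optuna_optimizer import LGBOptunaOptimizer
from inference_model.training.trainer import Trainer

trainer = Trainer(
    cat_cols=["category"],
    target_col="target",
    id_cols=["id"],
    objective="binary",
    optimizer=LGBOptunaOptimizer(objective="binary"),
)

# optimizes hyperparameters on train/valid, refits on train+valid,
# then evaluates on the held-out test set
metrics = trainer.fit(df_train=df_train, df_valid=df_valid, df_test=df_test)

print(metrics)
print(trainer.model)  # the trained lgb.basic.Booster
```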

__init__(cat_cols, target_col, id_cols, objective, optimizer=None, n_class=None, preprocessors=None)

Object that governs training and parameter optimization of the lgbm model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| cat_cols | list | list of categorical feature column names | required |
| target_col | str | column name that represents target | required |
| id_cols | list | identification column names | required |
| objective | str | type of the task/objective | required |
| optimizer | BaseOptimizer | parameter optimizer object | None |
| n_class | int | number of classes in the dataset | None |
| preprocessors | List[Union[Any, PreprocessData]] | ordered list of objects to preprocess dataset before optimization and training | None |

Source code in inference_model/training/trainer.py, lines 18-54
def __init__(
    self,
    cat_cols: List[str],
    target_col: str,
    id_cols: List[str],
    objective: Literal["binary"],
    optimizer: Union[Any, BaseOptimizer, None] = None,
    n_class: Optional[int] = None,
    preprocessors: Optional[List[Union[Any, PreprocessData]]] = None,
):
    """Objects that governs training and parameter optimization of the lgbm model.

    Args:
        cat_cols (list): list of categorical feature column names
        target_col (str): column name that represents target
        id_cols (list): identification column names
        objective (str): type of the task/objective
        optimizer (BaseOptimizer): parameter optimizer object
        n_class (int): number of classes in the dataset
        preprocessors (List[Union[Any, PreprocessData]]):
            ordered list of objects to preprocess dataset before optimization
            and training
    """
    super(Trainer, self).__init__(
        cat_cols=cat_cols,
        target_col=target_col,
        id_cols=id_cols,
        objective=objective,
        n_class=n_class,
        preprocessors=preprocessors,
    )
    if optimizer is not None:
        if not hasattr(optimizer, "optimize"):
            raise AttributeError(
                "{} optimizer must have {} method".format(optimizer, "optimize")
            )
    self.optimizer = optimizer

fit(df_train, df_valid, df_test, random_state=1)

Train the model and optimize the parameters.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_train | DataFrame | training dataset | required |
| df_valid | DataFrame | validation dataset | required |
| df_test | DataFrame | testing dataset | required |

Source code in inference_model/training/trainer.py, lines 126-170
def fit(
    self,
    df_train: pd.DataFrame,
    df_valid: pd.DataFrame,
    df_test: pd.DataFrame,
    random_state: int = 1,
) -> pd.DataFrame:
    """Train the model and optimize the parameters.

    Args:
        df_train (pd.DataFrame): training dataset
        df_valid (pd.DataFrame): validation dataset
        df_test (pd.DataFrame): testing dataset
    Returns:
        metrics_dict (pd.DataFrame): metrics computed on the test dataset
    """
    if self.preprocessors:
        # NOTE: with multiple preprocessors only the last transform is kept;
        # each iteration should instead use
        # df_train_prep = prep.transform(df_train_prep.drop(columns=[self.target_col]))
        for prep in self.preprocessors:
            df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
            df_valid_prep = prep.transform(df_valid.drop(columns=[self.target_col]))
        if self.objective in ["binary"]:
            df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
            df_valid_prep[self.target_col] = df_valid[self.target_col].astype(int)
        else:
            raise NotImplementedError
    else:
        df_train_prep = df_train.copy()
        df_valid_prep = df_valid.copy()

    lgb_train, lgb_valid = to_lgbdataset(
        train=df_train_prep,
        cat_cols=self.cat_cols,
        target_col=self.target_col,
        id_cols=self.id_cols,
        valid=df_valid_prep,
    )
    self.optimizer.optimize(dtrain=lgb_train, deval=lgb_valid)

    df_train_valid = pd.concat([df_train, df_valid], ignore_index=True)
    df_train_valid = df_train_valid.reset_index(drop=True)
    self.train(df_train=df_train_valid)
    metrics_dict = self.compute_metrics(df_test)
    return metrics_dict

train(df_train, params=None, df_valid=None)

Train the model with the parameters.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_train | DataFrame | training dataset | required |
| params | dict | model parameters | None |
| df_valid | DataFrame | optional validation dataset | None |

Source code in inference_model/training/trainer.py, lines 56-124
def train(
    self,
    df_train: pd.DataFrame,
    params: Optional[Dict] = None,
    df_valid: Optional[pd.DataFrame] = None,
):
    """Train the model with the parameters.

    Args:
        df_train (pd.DataFrame): training dataset
        params (dict): model parameters
        df_valid (pd.DataFrame): optional validation dataset
    Returns:
        None: the trained model is stored as self.model (lgb.basic.Booster)
    """
    if self.preprocessors:
        for prep in self.preprocessors:
            df_train_prep = prep.transform(df_train.drop(columns=[self.target_col]))
            df_valid_prep = (
                prep.transform(df_valid.drop(columns=[self.target_col]))
                if df_valid is not None
                else None
            )
        if self.objective in ["binary"]:
            df_train_prep[self.target_col] = df_train[self.target_col].astype(int)
            if df_valid is not None:
                df_valid_prep[self.target_col] = df_valid[self.target_col].astype(
                    int
                )
        else:
            raise NotImplementedError
    else:
        df_train_prep = df_train.copy()
        df_valid_prep = df_valid.copy() if df_valid is not None else None

    cat_cols = self.cat_cols
    if params:
        config = params
    elif (params is None) and (self.optimizer):
        config = self.optimizer.best
        if hasattr(self.optimizer, "best_to_drop"):
            df_train_prep.drop(
                columns=self.optimizer.best_to_drop,  # type: ignore
                inplace=True,
            )
            if df_valid is not None:
                df_valid_prep.drop(
                    columns=self.optimizer.best_to_drop,  # type: ignore
                    inplace=True,
                )
            cat_cols = intsec(self.cat_cols, df_train_prep.columns.values)

    elif (params is None) and (self.optimizer is None):
        config = self.base_params

    lgb_train, lgb_valid = to_lgbdataset(
        train=df_train_prep,
        cat_cols=cat_cols,
        target_col=self.target_col,
        id_cols=self.id_cols,
        valid=df_valid_prep,
    )

    self.model = lgb_train_function(
        config=config,
        lgbtrain=lgb_train,
        lgbeval=lgb_valid,
        feval=self.feval,
    )
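
For comparison, a sketch of calling `train` directly with a fixed parameter set instead of running hyperparameter optimization (parameter values are illustrative; `df_train` and `df_valid` are assumed pandas DataFrames containing the target and id columns):

```python
from inference_model.training.trainer import Trainer

trainer = Trainer(
    cat_cols=["category"],
    target_col="target",
    id_cols=["id"],
    objective="binary",
)

trainer.train(
    df_train=df_train,
    params={"objective": "binary", "learning_rate": 0.05, "num_leaves": 31},
    df_valid=df_valid,
)

# the fitted booster is stored on the trainer
booster = trainer.model
```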

flatten_dict(d, parent_key='', sep='_')

Flatten a nested dictionary. Fastest approach according to https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/

Source code in inference_model/training/utils.py, lines 184-197
def flatten_dict(
    d: MutableMapping, parent_key: str = "", sep: str = "_"
) -> MutableMapping:
    """
    Flatten a nested dictionary. Fastest approach according to https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/
    """
    items: List[Any] = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
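
For example:

```python
from inference_model.training.utils import flatten_dict

params = {"model": {"learning_rate": 0.05, "lgbm": {"num_leaves": 31}}, "seed": 1}

print(flatten_dict(params))
# {'model_learning_rate': 0.05, 'model_lgbm_num_leaves': 31, 'seed': 1}
```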

get_feature_importance(model)

Extract model feature importances and return sorted dataframe.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | Booster | LightGBM model | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| feature_imp | DataFrame | sorted dataframe with features and their importances |

Source code in inference_model/training/utils.py, lines 166-181
def get_feature_importance(model: lgb.basic.Booster) -> pd.DataFrame:
    """Extract model feature importances and return sorted dataframe.

    Args:
        model (lgb.basic.Booster): LightGBM model

    Returns:
        feature_imp (pd.DataFrame): sorted dataframe with features and their
            importances
    """
    feature_imp = pd.DataFrame(
        {"value": model.feature_importance(), "feature": model.feature_name()}
    )
    feature_imp.sort_values(by="value", inplace=True)
    feature_imp.reset_index(drop=True, inplace=True)
    return feature_imp
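
For example, assuming `model` is a trained lgb.basic.Booster (e.g. `trainer.model` from the Trainer above):

```python
from inference_model.training.utils import get_feature_importance

feature_imp = get_feature_importance(model)
print(feature_imp.tail(10))  # the ten most important features (sorted ascending)
```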

get_or_create_experiment(experiment_name)

Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

This function checks if an experiment with the given name exists within MLflow. If it does, the function returns its ID. If not, it creates a new experiment with the provided name and returns its ID.

Parameters: - experiment_name (str): Name of the MLflow experiment.

Returns: - str: ID of the existing or newly created MLflow experiment.

Source code in inference_model/training/utils.py, lines 11-29
def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)
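
A usage sketch (the experiment name and logged parameter are illustrative):

```python
import mlflow

from inference_model.training.utils import get_or_create_experiment

experiment_id = get_or_create_experiment("inference-model-training")

# log a run under the existing or newly created experiment
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_param("learning_rate", 0.05)
```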

predict_cls_lgbm_from_raw(preds_raw, task)

Helper function to convert raw margin predictions into predicted class labels (via a sigmoid for binary and a softmax for multiclass tasks).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| preds_raw | ndarray | raw margin predictions | required |
| task | str | type of task/objective, "binary" or "multiclass" | required |

Source code in inference_model/training/utils.py, lines 142-163
def predict_cls_lgbm_from_raw(
    preds_raw: np.ndarray, task: Literal["binary", "multiclass"]
) -> np.ndarray:
    """Helper function to convert raw margin predictions through a
    sigmoid to represent a probability.

    Args:
        preds_raw (ndarray):
            predictions
        lgbDataset (lightgbm.Dataset):
            dataset, containing labels, used for prediction
    Returns:
        (y_true, preds):
            tuple containg labels and predictions for further evaluation
    """
    predicted_probs = predict_proba_lgbm_from_raw(preds_raw=preds_raw, task=task)
    if task == "binary":
        pred_cls = np.array([int(p > 0.5) for p in predicted_probs])
    elif task == "multiclass":
        pred_cls = predicted_probs.argmax(axis=1)

    return pred_cls
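
A sketch of converting raw margins into class labels, assuming `booster` is a trained binary lgb.basic.Booster and `X` a feature DataFrame:

```python
from inference_model.training.utils import predict_cls_lgbm_from_raw

preds_raw = booster.predict(X, raw_score=True)  # raw margins, not probabilities
pred_cls = predict_cls_lgbm_from_raw(preds_raw, task="binary")  # 0/1 labels at a 0.5 threshold
```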

predict_proba_lgbm_from_raw(preds_raw, task, binary2d=False)

Apply a sigmoid (binary) or softmax (multiclass) to the raw output of lightgbm.predict(). This replaces predict_proba().

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| preds_raw | ndarray | 1D numpy array of arrays | required |
| task | str | type of task/objective | required |
| binary2d | boolean | whether the output of binary classification should be a 1d or 2d vector | False |

Source code in inference_model/training/utils.py, lines 113-139
def predict_proba_lgbm_from_raw(
    preds_raw: np.ndarray,
    task: Literal["binary", "multiclass"],
    binary2d: Optional[bool] = False,
) -> np.ndarray:
    """Apply softmax to array of arrays that is an output of lightgbm.predct().
    This replaces predict_proba().

    Args:
        predicted_raw (ndarray):
            1D numpy array of arrays
        task (str):
            type of task/objective
        binary2d (boolean):
            wether the output of binary classification should be 1 or 2d vector
    Returns:
        predicted_probs (ndarray): array with predicted probabilities
    """
    if task == "binary":
        predicted_probs = _sigmoid(preds_raw)
        if binary2d:
            predicted_probs = np.apply_along_axis(
                _binary_margin_to_prob, 1, np.vstack(tuple(predicted_probs))
            )
    elif task == "multiclass":
        predicted_probs = np.apply_along_axis(_softmax, 1, np.stack(tuple(preds_raw)))
    return predicted_probs
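
A sketch using made-up raw margins for a binary model; `binary2d=True` is expected to return one row of class probabilities per sample rather than a flat vector, as the docstring suggests:

```python
import numpy as np

from inference_model.training.utils import predict_proba_lgbm_from_raw

# raw margins as returned by booster.predict(X, raw_score=True)
preds_raw = np.array([-2.0, 0.0, 1.5])

probs = predict_proba_lgbm_from_raw(preds_raw, task="binary")
print(probs)  # sigmoid of the margins: approx. [0.12, 0.5, 0.82]

probs_2d = predict_proba_lgbm_from_raw(preds_raw, task="binary", binary2d=True)
print(probs_2d.shape)  # expected (3, 2): one row of class probabilities per sample
```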

to_lgbdataset(train, cat_cols, target_col, id_cols=[], valid=None)

Transform pandas dataframes into a LightGBM training dataset and, optionally, an evaluation dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| train | DataFrame | training dataset | required |
| cat_cols | list | list of categorical columns | required |
| target_col | str | target column in the dataset | required |
| id_cols | list | list of identifier columns | [] |
| valid | DataFrame | validation dataset | None |

Source code in inference_model/training/utils.py, lines 32-76
def to_lgbdataset(
    train: pd.DataFrame,
    cat_cols: List[str],
    target_col: str,
    id_cols: List[str] = [],
    valid: Optional[pd.DataFrame] = None,
) -> Tuple[lgbDataset, Optional[lgbDataset]]:
    """Transform pandas dataframe to lgbm dataset, or datasets(eval).

    Args:
        train (pd.DataFrame):
            training dataset
        cat_cols (list):
            list of categorical columns
        target_col (str):
            target column in the dataset
        id_cols (list):
            list of identifier columns
        valid (pd.DataFrame):
            validation dataset
    Returns:
        lgb_train (lgbDataset):
            lgbm training dataset
        lgb_valid (lgbDataset):
            lgbm valid dataset
    """

    lgb_train = lgbDataset(
        train.drop(columns=[target_col] + id_cols),
        train[target_col],
        categorical_feature=cat_cols,
        free_raw_data=False,
    )

    if valid is not None:
        lgb_valid = lgbDataset(
            valid.drop(columns=[target_col] + id_cols),
            valid[target_col],
            reference=lgb_train,
            free_raw_data=False,
        )
    else:
        lgb_valid = None

    return lgb_train, lgb_valid
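
A minimal sketch with a toy frame (column names are illustrative):

```python
import pandas as pd

from inference_model.training.utils import to_lgbdataset

df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "category": pd.Categorical(["a", "b", "a", "b"]),
        "num_feature": [0.1, 0.4, 0.3, 0.9],
        "target": [0, 1, 0, 1],
    }
)

lgb_train, lgb_valid = to_lgbdataset(
    train=df,
    cat_cols=["category"],
    target_col="target",
    id_cols=["id"],
)
print(lgb_valid)  # None, since no validation frame was passed
```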