Models¶

Imports¶

In [1]:

Copied!

import os
import sys

# Make the directory the notebook was started from importable before leaving it.
sys.path.append(os.getcwd())
# Move two directory levels up so that relative paths used by later cells
# (e.g. "data/...") resolve from there.
# NOTE(review): the target is relative to the current directory, so running this
# cell a second time in the same kernel moves up two more levels.
os.chdir("../..")

import pandas as pd

# Raise pandas display limits so wide/long frames are not truncated in cell output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
import os
import sys

# Make the directory the notebook was started from importable before leaving it.
sys.path.append(os.getcwd())
# Move two directory levels up so that relative paths used by later cells
# (e.g. "data/...") resolve from there.
# NOTE(review): the target is relative to the current directory, so running this
# cell a second time in the same kernel moves up two more levels.
os.chdir("../..")

import pandas as pd

# Raise pandas display limits so wide/long frames are not truncated in cell output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)

In [2]:

Copied!





import dill
import numpy as np
import matplotlib.pyplot as plt

import h2o
from h2o.automl import H2OAutoML

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression

import lightgbm as lgbm
from inference_model.preprocessing.scaler import scaler_mapper
from inference_model.training.utils import (
    predict_cls_lgbm_from_raw,
    predict_proba_lgbm_from_raw,
)
from inference_model.preprocessing.label_encoder import LabelEncoder
import ast

# import tracemalloc
# import warnings
# from typing import Optional, Dict
# tracemalloc.start()
import dill
import numpy as np
import matplotlib.pyplot as plt

import h2o
from h2o.automl import H2OAutoML

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression

import lightgbm as lgbm
from inference_model.preprocessing.scaler import scaler_mapper
from inference_model.training.utils import (
    predict_cls_lgbm_from_raw,
    predict_proba_lgbm_from_raw,
)
from inference_model.preprocessing.label_encoder import LabelEncoder
import ast

# import tracemalloc
# import warnings
# from typing import Optional, Dict
# tracemalloc.start()

Dataset¶

In [3]:

Copied!





# Seed passed to the data splits and to the AutoML search for reproducibility.
random_state = 1

# Hold-out configuration: `valid_size` is the fraction of all rows set aside,
# `test_size` is the fraction of that hold-out which becomes the test split.
valid_size = 0.2
test_size = 0.5

# NOTE(review): not referenced by any other cell visible in this notebook.
test_n_valid_combined = True
# Seed passed to the data splits and to the AutoML search for reproducibility.
random_state = 1

# Hold-out configuration: `valid_size` is the fraction of all rows set aside,
# `test_size` is the fraction of that hold-out which becomes the test split.
valid_size = 0.2
test_size = 0.5

# NOTE(review): not referenced by any other cell visible in this notebook.
test_n_valid_combined = True

In [4]:

Copied!





# 1. get/create some example data
# The log file contains a Python literal; literal_eval parses it without executing code.
with open("data/log_tiguan_27_mar_dac.txt") as log_file:
    data = ast.literal_eval(log_file.read())

df_pd = pd.DataFrame()
for signal_entry in data:
    # The first key of each entry doubles as the signal (column) name.
    signal_name = list(signal_entry)[0]
    readings = pd.DataFrame(signal_entry[signal_name])
    # Order the readings by timestamp and keep only the measured value.
    signal_values = readings.sort_values(by="ts_millis:", ascending=True)["value"]
    signal_values.rename(signal_name, inplace=True)
    # Signals are appended side by side; pandas aligns them on the row index.
    df_pd = pd.concat([df_pd, signal_values], axis=1)

# Keep only rows for which every signal has a value.
df_pd.dropna(inplace=True)
df_pd["class"] = 0
df_pd["car_id"] = "123abc"
# Synthetic positive class: rows up to index label 100 (label slicing is inclusive)
# are flagged and given a constant engine load.
df_pd.loc[:100, ["class"]] = 1
df_pd.loc[:100, ["engine_load"]] = 100

df_pd.head()
# 1. get/create some example data
# The log file contains a Python literal; literal_eval parses it without executing code.
with open("data/log_tiguan_27_mar_dac.txt") as log_file:
    data = ast.literal_eval(log_file.read())

df_pd = pd.DataFrame()
for signal_entry in data:
    # The first key of each entry doubles as the signal (column) name.
    signal_name = list(signal_entry)[0]
    readings = pd.DataFrame(signal_entry[signal_name])
    # Order the readings by timestamp and keep only the measured value.
    signal_values = readings.sort_values(by="ts_millis:", ascending=True)["value"]
    signal_values.rename(signal_name, inplace=True)
    # Signals are appended side by side; pandas aligns them on the row index.
    df_pd = pd.concat([df_pd, signal_values], axis=1)

# Keep only rows for which every signal has a value.
df_pd.dropna(inplace=True)
df_pd["class"] = 0
df_pd["car_id"] = "123abc"
# Synthetic positive class: rows up to index label 100 (label slicing is inclusive)
# are flagged and given a constant engine load.
df_pd.loc[:100, ["class"]] = 1
df_pd.loc[:100, ["engine_load"]] = 100

df_pd.head()

Out[4]:

	engine_load	engine_coolant_temp	engine_speed	intake_air_temp	maf	throttle_position	fuel_rg_pressure	barometric_pressure	control_voltage	class	car_id
0	100.0	17.0	904.5	10.0	12.55	83.14	37270.0	101.0	0.06	1	123abc
1	100.0	17.0	906.0	11.0	12.36	83.14	37800.0	101.0	14.56	1	123abc
2	100.0	17.0	905.0	10.0	12.36	83.53	37800.0	101.0	14.68	1	123abc
3	100.0	18.0	905.5	11.0	12.30	83.53	37800.0	101.0	14.72	1	123abc
4	100.0	18.0	907.0	11.0	12.47	83.14	36740.0	101.0	14.72	1	123abc

In [8]:

Copied!





# Column roles shared by the modelling cells.
target_col = "class"
id_cols = ["car_id"]
cat_cols = []

# Every column that is not an id, a categorical feature or the target is continuous.
non_continuous_cols = id_cols + cat_cols + [target_col]
cont_cols = df_pd.columns.drop(non_continuous_cols).tolist()

# Categorical features are stored as strings.
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
# Column roles shared by the modelling cells.
target_col = "class"
id_cols = ["car_id"]
cat_cols = []

# Every column that is not an id, a categorical feature or the target is continuous.
non_continuous_cols = id_cols + cat_cols + [target_col]
cont_cols = df_pd.columns.drop(non_continuous_cols).tolist()

# Categorical features are stored as strings.
df_pd[cat_cols] = df_pd[cat_cols].astype(str)

Check for possible class imbalance.

In [9]:

Copied!

# Count the rows per target class to see how skewed the labels are.
df_pd[target_col].value_counts()
# Count the rows per target class to see how skewed the labels are.
df_pd[target_col].value_counts()

Out[9]:

class
0    934
1    101
Name: count, dtype: int64

Preprocessing¶

Divide the dataset into train, validation, and test sets.
Scale the continuous columns with a standard scaler (not needed for LightGBM, but needed for other models).

In [10]:

Copied!





# First split: set aside `valid_size` of the rows, stratified on the target.
df_train, df_holdout = train_test_split(
    df_pd,
    test_size=valid_size,
    stratify=df_pd[target_col],
    random_state=random_state,
)
# Second split: divide the hold-out into validation and test parts.
df_valid, df_test = train_test_split(
    df_holdout,
    test_size=test_size,
    stratify=df_holdout[target_col],
    random_state=random_state,
)

# Give every split a fresh 0..n-1 index.
for split_frame in (df_train, df_valid, df_test):
    split_frame.reset_index(inplace=True, drop=True)

# Only the continuous columns get a scaler; the target is listed together with the
# id columns, which are mapped to no scaler (None).
scaler_mapper_def = {"cont_cols": StandardScaler, "cat_cols": None, "id_cols": None}
scaler = scaler_mapper(
    cont_cols=cont_cols,
    cat_cols=cat_cols,
    id_cols=[target_col] + id_cols,
    scaler_mapper_def=scaler_mapper_def,
)

# Fit on the training split only, then reuse the fitted scaler for the other splits.
df_train_scaled = scaler.fit_transform(df_train)
df_test_scaled = scaler.transform(df_test)
df_valid_scaled = scaler.transform(df_valid)
# First split: set aside `valid_size` of the rows, stratified on the target.
df_train, df_holdout = train_test_split(
    df_pd,
    test_size=valid_size,
    stratify=df_pd[target_col],
    random_state=random_state,
)
# Second split: divide the hold-out into validation and test parts.
df_valid, df_test = train_test_split(
    df_holdout,
    test_size=test_size,
    stratify=df_holdout[target_col],
    random_state=random_state,
)

# Give every split a fresh 0..n-1 index.
for split_frame in (df_train, df_valid, df_test):
    split_frame.reset_index(inplace=True, drop=True)

# Only the continuous columns get a scaler; the target is listed together with the
# id columns, which are mapped to no scaler (None).
scaler_mapper_def = {"cont_cols": StandardScaler, "cat_cols": None, "id_cols": None}
scaler = scaler_mapper(
    cont_cols=cont_cols,
    cat_cols=cat_cols,
    id_cols=[target_col] + id_cols,
    scaler_mapper_def=scaler_mapper_def,
)

# Fit on the training split only, then reuse the fitted scaler for the other splits.
df_train_scaled = scaler.fit_transform(df_train)
df_test_scaled = scaler.transform(df_test)
df_valid_scaled = scaler.transform(df_valid)

Logistic Regression¶

In [11]:

Copied!





# Columns that are not fed to the classifier as features.
lr_drop_cols = [target_col] + id_cols + cat_cols
lr_train_features = df_train_scaled.drop(columns=lr_drop_cols)
lr_test_features = df_test_scaled.drop(columns=lr_drop_cols)

# class_weight="balanced" compensates for the skewed class distribution.
LR_clf = LogisticRegression(class_weight="balanced")
LR_clf.fit(lr_train_features, df_train_scaled[target_col])
LR_predicted = LR_clf.predict(lr_test_features)

lr_report = classification_report(df_test_scaled[target_col], LR_predicted)
print("LR classification report :\n" + str(lr_report))
# Columns that are not fed to the classifier as features.
lr_drop_cols = [target_col] + id_cols + cat_cols
lr_train_features = df_train_scaled.drop(columns=lr_drop_cols)
lr_test_features = df_test_scaled.drop(columns=lr_drop_cols)

# class_weight="balanced" compensates for the skewed class distribution.
LR_clf = LogisticRegression(class_weight="balanced")
LR_clf.fit(lr_train_features, df_train_scaled[target_col])
LR_predicted = LR_clf.predict(lr_test_features)

lr_report = classification_report(df_test_scaled[target_col], LR_predicted)
print("LR classification report :\n" + str(lr_report))

LR classification report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        94
           1       1.00      1.00      1.00        10

    accuracy                           1.00       104
   macro avg       1.00      1.00      1.00       104
weighted avg       1.00      1.00      1.00       104

In [12]:

Copied!





# Pick the decision threshold that maximises the chosen criterion (weighted F1)
# for the logistic regression. Which metric to optimise depends on whether it is
# more important to predict the positive or the negative class precisely.
# NOTE(review): the threshold is tuned on the test split, so the reported score is
# optimistic; consider tuning on the validation split instead.
from sklearn.metrics import f1_score

criterion = f1_score

# The predicted probabilities do not depend on the threshold, so score the test
# set once instead of on every loop iteration.
positive_proba = LR_clf.predict_proba(
    df_test_scaled.drop(columns=[target_col] + id_cols + cat_cols)
)[:, 1]

threshold_score = []
for t in np.arange(0.2, 0.8, 0.01):
    preds_bin = (positive_proba >= t).astype(int)
    threshold_score.append(
        (t, criterion(df_test_scaled[target_col], preds_bin, average="weighted"))
    )

# sorted() is stable, so among equally scored thresholds the lowest one stays first.
threshold_score = sorted(threshold_score, key=lambda x: x[1], reverse=True)
best_threshold, best_score = threshold_score[0]

print(f"The best threshold\n{best_threshold}\n, with score:\n{best_score}")
# Pick the decision threshold that maximises the chosen criterion (weighted F1)
# for the logistic regression. Which metric to optimise depends on whether it is
# more important to predict the positive or the negative class precisely.
# NOTE(review): the threshold is tuned on the test split, so the reported score is
# optimistic; consider tuning on the validation split instead.
from sklearn.metrics import f1_score

criterion = f1_score

# The predicted probabilities do not depend on the threshold, so score the test
# set once instead of on every loop iteration.
positive_proba = LR_clf.predict_proba(
    df_test_scaled.drop(columns=[target_col] + id_cols + cat_cols)
)[:, 1]

threshold_score = []
for t in np.arange(0.2, 0.8, 0.01):
    preds_bin = (positive_proba >= t).astype(int)
    threshold_score.append(
        (t, criterion(df_test_scaled[target_col], preds_bin, average="weighted"))
    )

# sorted() is stable, so among equally scored thresholds the lowest one stays first.
threshold_score = sorted(threshold_score, key=lambda x: x[1], reverse=True)
best_threshold, best_score = threshold_score[0]

print(f"The best threshold\n{best_threshold}\n, with score:\n{best_score}")

The best threshold
0.2
, with score:
1.0

In [13]:

Copied!





# One-hot encode the true labels so that every class gets its own indicator column.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values
lr_auc_features = df_test_scaled.drop(columns=[target_col] + id_cols + cat_cols)
y_score = LR_clf.predict_proba(lr_auc_features)

# Despite the `micro_` prefix these are the per-class (average=None) and the
# weighted (average="weighted") AUC values.
micro_roc_auc_per_class = roc_auc_score(y_onehot_test, y_score, average=None)
micro_roc_auc_weighted = roc_auc_score(y_onehot_test, y_score, average="weighted")
# One-hot encode the true labels so that every class gets its own indicator column.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values
lr_auc_features = df_test_scaled.drop(columns=[target_col] + id_cols + cat_cols)
y_score = LR_clf.predict_proba(lr_auc_features)

# Despite the `micro_` prefix these are the per-class (average=None) and the
# weighted (average="weighted") AUC values.
micro_roc_auc_per_class = roc_auc_score(y_onehot_test, y_score, average=None)
micro_roc_auc_weighted = roc_auc_score(y_onehot_test, y_score, average="weighted")

In [14]:

Copied!

# Report the most recently computed AUC values, one labelled block per metric.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)
# Report the most recently computed AUC values, one labelled block per metric.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)

Per class AUC:
[1. 1.]
Weighted AUC:
1.0

H2O AutoML¶

NOTE:

Java is required to run H2O; on Debian/Ubuntu it can be installed with: sudo apt install openjdk-17-jdk

In [16]:

Copied!





# Initialize H2O: start (or connect to) a local cluster, logging warnings and above.
h2o.init(log_dir="h2o_logs", log_level="WARN")


def _to_h2o_frame(df):
    """Convert a pandas split into an H2OFrame ready for classification.

    The id columns are dropped and the target column is cast to a factor,
    because for binary classification the response should be a factor.
    """
    frame = h2o.H2OFrame(df.drop(columns=id_cols))
    frame[target_col] = frame[target_col].asfactor()
    return frame


# Read the three splits as H2O frames.
h2o_train = _to_h2o_frame(df_train_scaled)
h2o_valid = _to_h2o_frame(df_valid_scaled)
h2o_test = _to_h2o_frame(df_test_scaled)

# Define the AutoML task. Over/under-sampling of the classes is requested through
# the constructor argument instead of assigning the attribute afterwards.
aml = H2OAutoML(seed=random_state, max_runtime_secs=180, balance_classes=True)

# Run it; the validation split is only used to rank models on the leaderboard.
_ = aml.train(
    x=cont_cols + cat_cols,
    y=target_col,
    training_frame=h2o_train,
    leaderboard_frame=h2o_valid,
)

m = aml.get_best_model()

# Leaderboard, show and save
lb = h2o.automl.get_leaderboard(aml, extra_columns="ALL")
print(lb)
print(m.model_performance(h2o_valid))
predictions = m.predict(h2o_test)

# save results and model
# h2o.export_file(lb, path="h2o_logs/leaderboard.csv", force=True)
# MOJO is h2o version agnostic
# m.save_mojo("h2o_logs/bestmodel.zip")
# Initialize H2O: start (or connect to) a local cluster, logging warnings and above.
h2o.init(log_dir="h2o_logs", log_level="WARN")


def _to_h2o_frame(df):
    """Convert a pandas split into an H2OFrame ready for classification.

    The id columns are dropped and the target column is cast to a factor,
    because for binary classification the response should be a factor.
    """
    frame = h2o.H2OFrame(df.drop(columns=id_cols))
    frame[target_col] = frame[target_col].asfactor()
    return frame


# Read the three splits as H2O frames.
h2o_train = _to_h2o_frame(df_train_scaled)
h2o_valid = _to_h2o_frame(df_valid_scaled)
h2o_test = _to_h2o_frame(df_test_scaled)

# Define the AutoML task. Over/under-sampling of the classes is requested through
# the constructor argument instead of assigning the attribute afterwards.
aml = H2OAutoML(seed=random_state, max_runtime_secs=180, balance_classes=True)

# Run it; the validation split is only used to rank models on the leaderboard.
_ = aml.train(
    x=cont_cols + cat_cols,
    y=target_col,
    training_frame=h2o_train,
    leaderboard_frame=h2o_valid,
)

m = aml.get_best_model()

# Leaderboard, show and save
lb = h2o.automl.get_leaderboard(aml, extra_columns="ALL")
print(lb)
print(m.model_performance(h2o_valid))
predictions = m.predict(h2o_test)

# save results and model
# h2o.export_file(lb, path="h2o_logs/leaderboard.csv", force=True)
# MOJO is h2o version agnostic
# m.save_mojo("h2o_logs/bestmodel.zip")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.10" 2024-01-16; OpenJDK Runtime Environment (build 17.0.10+7-Ubuntu-122.04.1); OpenJDK 64-Bit Server VM (build 17.0.10+7-Ubuntu-122.04.1, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp0jm6weez
  JVM stdout: /tmp/tmp0jm6weez/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp0jm6weez/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


H2O_cluster_uptime:	01 secs
H2O_cluster_timezone:	Etc/UTC
H2O_data_parsing_timezone:	UTC
H2O_cluster_version:	3.46.0.2
H2O_cluster_version_age:	8 days
H2O_cluster_name:	H2O_from_python_unknownUser_maoqtf
H2O_cluster_total_nodes:	1
H2O_cluster_free_memory:	16 Gb
H2O_cluster_total_cores:	12
H2O_cluster_allowed_cores:	12
H2O_cluster_status:	locked, healthy
H2O_connection_url:	http://127.0.0.1:54321
H2O_connection_proxy:	{"http": null, "https": null}
H2O_internal_security:	False
Python_version:	3.10.11 final

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
13:16:41.801: _train param, Dropping bad and constant columns: [barometric_pressure]

█
13:16:44.939: _train param, Dropping bad and constant columns: [barometric_pressure]

█
13:16:46.154: _train param, Dropping bad and constant columns: [barometric_pressure]

█████
13:16:59.224: _train param, Dropping unused columns: [barometric_pressure]
13:16:59.913: _train param, Dropping bad and constant columns: [barometric_pressure]

█
13:17:03.240: _train param, Dropping bad and constant columns: [barometric_pressure]

█
13:17:05.128: _train param, Dropping bad and constant columns: [barometric_pressure]

███
13:17:16.407: _train param, Dropping bad and constant columns: [barometric_pressure]

████
13:17:27.855: _train param, Dropping bad and constant columns: [barometric_pressure]

█████
13:17:40.196: _train param, Dropping unused columns: [barometric_pressure]
13:17:40.764: _train param, Dropping unused columns: [barometric_pressure]


13:17:41.338: _train param, Dropping bad and constant columns: [barometric_pressure]

█
13:17:44.575: _train param, Dropping bad and constant columns: [barometric_pressure]


13:17:45.413: _train param, Dropping bad and constant columns: [barometric_pressure]

███
13:17:53.937: _train param, Dropping bad and constant columns: [barometric_pressure]

█
13:17:55.712: _train param, Dropping unused columns: [barometric_pressure]


13:17:56.367: _train param, Dropping unused columns: [barometric_pressure]

██████████████████████████████████
13:19:35.448: _train param, Dropping unused columns: [barometric_pressure]

█
13:19:36.22: _train param, Dropping unused columns: [barometric_pressure]

██| (done) 100%
model_id                                                auc      logloss    aucpr    mean_per_class_error         rmse          mse    training_time_ms    predict_time_per_row_ms  algo
DeepLearning_grid_1_AutoML_1_20240522_131641_model_8      1  0.0177159          1                       0  0.0582198    0.00338955                  628                   0.127786  DeepLearning
DeepLearning_grid_1_AutoML_1_20240522_131641_model_4      1  0.0155053          1                       0  0.0481989    0.00232314                  117                   0.1101    DeepLearning
XGBoost_1_AutoML_1_20240522_131641                        1  0.0347672          1                       0  0.0789026    0.00622562                  464                   0.078661  XGBoost
XGBoost_grid_1_AutoML_1_20240522_131641_model_12          1  0.0651884          1                       0  0.106454     0.0113325                   328                   0.072441  XGBoost
DeepLearning_grid_1_AutoML_1_20240522_131641_model_1      1  0.0235103          1                       0  0.0600679    0.00360815                   99                   0.098096  DeepLearning
XGBoost_grid_1_AutoML_1_20240522_131641_model_11          1  0.00556141         1                       0  0.0320985    0.00103031                  996                   0.105708  XGBoost
GBM_4_AutoML_1_20240522_131641                            1  9.26741e-06        1                       0  9.39779e-05  8.83184e-09                4293                   0.280377  GBM
XGBoost_grid_1_AutoML_1_20240522_131641_model_6           1  0.0141646          1                       0  0.0244465    0.000597631                 630                   0.057813  XGBoost
XGBoost_grid_1_AutoML_1_20240522_131641_model_13          1  0.0318374          1                       0  0.0810283    0.00656559                  745                   0.069021  XGBoost
GBM_3_AutoML_1_20240522_131641                            1  2.55592e-05        1                       0  0.000259039  6.7101e-08                 4037                   0.153118  GBM
[47 rows x 10 columns]

ModelMetricsBinomial: deeplearning
** Reported on test data. **

MSE: 0.0033895466349161047
RMSE: 0.05821981307867713
LogLoss: 0.017715874610611436
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9979014979275557
       0    1    Error    Rate
-----  ---  ---  -------  -----------
0      93   0    0        (0.0/93.0)
1      0    10   0        (0.0/10.0)
Total  93   10   0        (0.0/103.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value    idx
---------------------------  -----------  -------  -----
max f1                       0.997901     1        9
max f2                       0.997901     1        9
max f0point5                 0.997901     1        9
max accuracy                 0.997901     1        9
max precision                0.999961     1        0
max recall                   0.997901     1        9
max specificity              0.999961     1        0
max absolute_mcc             0.997901     1        9
max min_per_class_accuracy   0.997901     1        9
max mean_per_class_accuracy  0.997901     1        9
max tns                      0.999961     93       0
max fns                      0.999961     9        0
max fps                      3.49717e-09  93       102
max tps                      0.997901     10       9
max tnr                      0.999961     1        0
max fnr                      0.999961     0.9      0
max fpr                      3.49717e-09  1        102
max tpr                      0.997901     1        9

Gains/Lift Table: Avg response rate:  9.71 %, avg score: 11.26 %
group    cumulative_data_fraction    lower_threshold    lift    cumulative_lift    response_rate    score        cumulative_response_rate    cumulative_score    capture_rate    cumulative_capture_rate    gain    cumulative_gain    kolmogorov_smirnov
-------  --------------------------  -----------------  ------  -----------------  ---------------  -----------  --------------------------  ------------------  --------------  -------------------------  ------  -----------------  --------------------
1        0.0194175                   0.999958           10.3    10.3               1                0.999959     1                           0.999959            0.2             0.2                        930     930                0.2
2        0.0291262                   0.999947           10.3    10.3               1                0.999947     1                           0.999955            0.1             0.3                        930     930                0.3
3        0.038835                    0.999937           10.3    10.3               1                0.999938     1                           0.999951            0.1             0.4                        930     930                0.4
4        0.0485437                   0.999927           10.3    10.3               1                0.99993      1                           0.999947            0.1             0.5                        930     930                0.5
5        0.0582524                   0.999888           10.3    10.3               1                0.999889     1                           0.999937            0.1             0.6                        930     930                0.6
6        0.106796                    0.372647           8.24    9.36364            0.8              0.879862     0.909091                    0.945358            0.4             1                          724     836.364            0.989247
7        0.15534                     0.10825            0       6.4375             0                0.168191     0.625                       0.702493            0               1                          -100    543.75             0.935484
8        0.203883                    0.00316095         0       4.90476            0                0.0682301    0.47619                     0.551478            0               1                          -100    390.476            0.88172
9        0.300971                    0.000343083        0       3.32258            0                0.00100323   0.322581                    0.373906            0               1                          -100    232.258            0.774194
10       0.398058                    0.000119786        0       2.5122             0                0.000202804  0.243902                    0.282759            0               1                          -100    151.22             0.666667
11       0.504854                    8.52118e-05        0       1.98077            0                0.000102754  0.192308                    0.222966            0               1                          -100    98.0769            0.548387
12       0.601942                    6.24023e-05        0       1.66129            0                7.12997e-05  0.16129                     0.187015            0               1                          -100    66.129             0.44086
13       0.699029                    4.02512e-05        0       1.43056            0                5.12346e-05  0.138889                    0.161048            0               1                          -100    43.0556            0.333333
14       0.796117                    2.31994e-05        0       1.2561             0                3.1514e-05   0.121951                    0.141412            0               1                          -100    25.6098            0.225806
15       0.893204                    1.36306e-05        0       1.11957            0                1.91168e-05  0.108696                    0.126043            0               1                          -100    11.9565            0.11828
16       1                           3.49717e-09        0       1                  0                5.41845e-06  0.0970874                   0.112583            0               1                          -100    0                  0
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%

In [17]:

Copied!





# One-hot encode the true labels so that every class gets its own indicator column.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values
# Drop the hard-label column; the remaining columns are used as per-class scores.
h2o_class_scores = predictions.drop(["predict"])
y_score = h2o.as_list(h2o_class_scores).values

# Despite the `micro_` prefix these are the per-class (average=None) and the
# weighted (average="weighted") AUC values.
micro_roc_auc_per_class = roc_auc_score(y_onehot_test, y_score, average=None)
micro_roc_auc_weighted = roc_auc_score(y_onehot_test, y_score, average="weighted")
# One-hot encode the true labels so that every class gets its own indicator column.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values
# Drop the hard-label column; the remaining columns are used as per-class scores.
h2o_class_scores = predictions.drop(["predict"])
y_score = h2o.as_list(h2o_class_scores).values

# Despite the `micro_` prefix these are the per-class (average=None) and the
# weighted (average="weighted") AUC values.
micro_roc_auc_per_class = roc_auc_score(y_onehot_test, y_score, average=None)
micro_roc_auc_weighted = roc_auc_score(y_onehot_test, y_score, average="weighted")

/opt/conda/lib/python3.10/site-packages/h2o/frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above) and activate it using:

with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
    pandas_df = h2o_df.as_data_frame()

  warnings.warn("Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using"

In [18]:

Copied!

# Report the most recently computed AUC values, one labelled block per metric.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)
# Report the most recently computed AUC values, one labelled block per metric.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)

Per class AUC:
[1. 1.]
Weighted AUC:
1.0

In [19]:

Copied!





# Draw one ROC curve per class on a shared axis, using the current
# `y_onehot_test` / `y_score` pair.
fig, ax = plt.subplots(figsize=(6, 6))
# Derive the class count from the one-hot label matrix instead of hard-coding it.
n_classes = y_onehot_test.shape[1]

for class_id in range(n_classes):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, class_id],
        name=f"ROC curve for class {class_id}",
        ax=ax,
    )

# A title lets the figure stand on its own when the notebook is skimmed.
_ = ax.set_title("One-vs-rest ROC curves per class")
# Draw one ROC curve per class on a shared axis, using the current
# `y_onehot_test` / `y_score` pair.
fig, ax = plt.subplots(figsize=(6, 6))
# Derive the class count from the one-hot label matrix instead of hard-coding it.
n_classes = y_onehot_test.shape[1]

for class_id in range(n_classes):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, class_id],
        name=f"ROC curve for class {class_id}",
        ax=ax,
    )

# A title lets the figure stand on its own when the notebook is skimmed.
_ = ax.set_title("One-vs-rest ROC curves per class")

No description has been provided for this image

In [20]:

Copied!





# Compare the true test labels with the hard labels predicted by the H2O model.
h2o_predicted_labels = h2o.as_list(predictions["predict"])
h2o_report = classification_report(df_test[target_col], h2o_predicted_labels)
print("Classification report:\n{}".format(h2o_report))
# Compare the true test labels with the hard labels predicted by the H2O model.
h2o_predicted_labels = h2o.as_list(predictions["predict"])
h2o_report = classification_report(df_test[target_col], h2o_predicted_labels)
print("Classification report:\n{}".format(h2o_report))

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        94
           1       1.00      1.00      1.00        10

    accuracy                           1.00       104
   macro avg       1.00      1.00      1.00       104
weighted avg       1.00      1.00      1.00       104

/opt/conda/lib/python3.10/site-packages/h2o/frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above) and activate it using:

with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
    pandas_df = h2o_df.as_data_frame()

  warnings.warn("Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using"

LightGBM¶

In [21]:

Copied!





# LightGBM training parameters: binary classification objective.
config = {"objective": "binary"}

# Work on copies so the scaled frames used by the other models are left untouched.
df_train_scaled_enc = df_train_scaled.copy()
df_valid_scaled_enc = df_valid_scaled.copy()
df_test_scaled_enc = df_test_scaled.copy()
lgb_cat_cols = cat_cols

# The encoder is fitted on the training split only and reused for the other splits.
label_encoder = LabelEncoder(lgb_cat_cols)
df_train_scaled_enc = label_encoder.fit_transform(df_train_scaled_enc)
df_valid_scaled_enc = label_encoder.transform(df_valid_scaled_enc)
df_test_scaled_enc = label_encoder.transform(df_test_scaled_enc)

# Columns removed from every feature matrix (categorical columns stay in as features).
lgb_drop_cols = [target_col] + id_cols

lgbtrain = lgbm.Dataset(
    df_train_scaled_enc.drop(columns=lgb_drop_cols),
    df_train_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
# The validation set declares the same categorical features as the other datasets
# and references the training set.
lgbvalid = lgbm.Dataset(
    df_valid_scaled_enc.drop(columns=lgb_drop_cols),
    df_valid_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    reference=lgbtrain,
    free_raw_data=False,
)
# Final TRAIN/TEST: train + validation rows combined, evaluated against the test split.
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(drop=True)
flgbtrain = lgbm.Dataset(
    ftrain.drop(columns=lgb_drop_cols),
    ftrain[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbtest = lgbm.Dataset(
    df_test_scaled_enc.drop(columns=lgb_drop_cols),
    df_test_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)
# LightGBM training parameters: binary classification objective.
config = {"objective": "binary"}

# Work on copies so the scaled frames used by the other models are left untouched.
df_train_scaled_enc = df_train_scaled.copy()
df_valid_scaled_enc = df_valid_scaled.copy()
df_test_scaled_enc = df_test_scaled.copy()
lgb_cat_cols = cat_cols

# The encoder is fitted on the training split only and reused for the other splits.
label_encoder = LabelEncoder(lgb_cat_cols)
df_train_scaled_enc = label_encoder.fit_transform(df_train_scaled_enc)
df_valid_scaled_enc = label_encoder.transform(df_valid_scaled_enc)
df_test_scaled_enc = label_encoder.transform(df_test_scaled_enc)

# Columns removed from every feature matrix (categorical columns stay in as features).
lgb_drop_cols = [target_col] + id_cols

lgbtrain = lgbm.Dataset(
    df_train_scaled_enc.drop(columns=lgb_drop_cols),
    df_train_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
# The validation set declares the same categorical features as the other datasets
# and references the training set.
lgbvalid = lgbm.Dataset(
    df_valid_scaled_enc.drop(columns=lgb_drop_cols),
    df_valid_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    reference=lgbtrain,
    free_raw_data=False,
)
# Final TRAIN/TEST: train + validation rows combined, evaluated against the test split.
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(drop=True)
flgbtrain = lgbm.Dataset(
    ftrain.drop(columns=lgb_drop_cols),
    ftrain[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbtest = lgbm.Dataset(
    df_test_scaled_enc.drop(columns=lgb_drop_cols),
    df_test_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)

In [22]:

Copied!





# Fit the booster on the training split, evaluating on the validation split
# (registered under an empty name) while training.
model = lgbm.train(
    params=config,
    train_set=lgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    feval=None,
)
# Fit the booster on the training split, evaluating on the validation split
# (registered under an empty name) while training.
model = lgbm.train(
    params=config,
    train_set=lgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    feval=None,
)

[LightGBM] [Info] Number of positive: 81, number of negative: 747
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 467
[LightGBM] [Info] Number of data points in the train set: 828, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097826 -> initscore=-2.221616
[LightGBM] [Info] Start training from score -2.221616
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

In [23]:

Copied!





# Score the test set with raw (untransformed) LightGBM outputs and let the
# project helper convert them into class labels for the binary task.
raw_test_scores = model.predict(lgbtest.data, raw_score=True)
res = predict_cls_lgbm_from_raw(preds_raw=raw_test_scores, task="binary")

# Put predictions next to the true labels so both can be inspected together.
result = pd.DataFrame(
    {"predicted": res, "ground_truth": df_test[target_col].values}
)

# NOTE(review): the recorded output for this cell shows perfect precision and
# recall on 104 rows — worth checking the features for target leakage.
report = classification_report(result["ground_truth"], result["predicted"])
print(f"Classification report:\n{report}")
# Score the test set with raw (untransformed) LightGBM outputs and let the
# project helper convert them into class labels for the binary task.
raw_test_scores = model.predict(lgbtest.data, raw_score=True)
res = predict_cls_lgbm_from_raw(preds_raw=raw_test_scores, task="binary")

# Put predictions next to the true labels so both can be inspected together.
result = pd.DataFrame(
    {"predicted": res, "ground_truth": df_test[target_col].values}
)

# NOTE(review): the recorded output for this cell shows perfect precision and
# recall on 104 rows — worth checking the features for target leakage.
report = classification_report(result["ground_truth"], result["predicted"])
print(f"Classification report:\n{report}")

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        94
           1       1.00      1.00      1.00        10

    accuracy                           1.00       104
   macro avg       1.00      1.00      1.00       104
weighted avg       1.00      1.00      1.00       104

In [24]:

Copied!





# One-hot encode the true test labels so every class has its own column,
# lining up with the two-column score matrix requested via binary2d=True.
# NOTE(review): labels are read from df_test_scaled here, while the
# classification-report cell reads them from df_test — confirm both frames
# share the same rows in the same order.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values

# The helper's argument is `preds_raw`, so hand it raw margins
# (raw_score=True), exactly as the classification-report cell does. Without
# the flag LightGBM's predict returns already-transformed probabilities for a
# binary objective, which the helper would presumably transform a second time
# (confirm against inference_model.training.utils). The ranking, and thus the
# AUC, would survive that, but the stored y_score values would be wrong.
y_score = predict_proba_lgbm_from_raw(
    preds_raw=model.predict(lgbtest.data, raw_score=True),
    task="binary",
    binary2d=True,
)

# average=None returns one AUC per class column. Despite the "micro_" prefix
# in the variable names, no micro-averaging is performed in this cell.
micro_roc_auc_per_class = roc_auc_score(
    y_onehot_test,
    y_score,
    average=None,
)

# average="weighted" averages the per-class AUCs weighted by class support.
micro_roc_auc_weighted = roc_auc_score(
    y_onehot_test,
    y_score,
    average="weighted",
)

print(f"Per class AUC:\n{micro_roc_auc_per_class}")
print(f"Weighted AUC:\n{micro_roc_auc_weighted}")
# One-hot encode the true test labels so every class has its own column,
# lining up with the two-column score matrix requested via binary2d=True.
# NOTE(review): labels are read from df_test_scaled here, while the
# classification-report cell reads them from df_test — confirm both frames
# share the same rows in the same order.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values

# The helper's argument is `preds_raw`, so hand it raw margins
# (raw_score=True), exactly as the classification-report cell does. Without
# the flag LightGBM's predict returns already-transformed probabilities for a
# binary objective, which the helper would presumably transform a second time
# (confirm against inference_model.training.utils). The ranking, and thus the
# AUC, would survive that, but the stored y_score values would be wrong.
y_score = predict_proba_lgbm_from_raw(
    preds_raw=model.predict(lgbtest.data, raw_score=True),
    task="binary",
    binary2d=True,
)

# average=None returns one AUC per class column. Despite the "micro_" prefix
# in the variable names, no micro-averaging is performed in this cell.
micro_roc_auc_per_class = roc_auc_score(
    y_onehot_test,
    y_score,
    average=None,
)

# average="weighted" averages the per-class AUCs weighted by class support.
micro_roc_auc_weighted = roc_auc_score(
    y_onehot_test,
    y_score,
    average="weighted",
)

print(f"Per class AUC:\n{micro_roc_auc_per_class}")
print(f"Weighted AUC:\n{micro_roc_auc_weighted}")

Per class AUC:
[1. 1.]
Weighted AUC:
1.0