Models¶
Imports¶
In [1]:
Copied!
import os
import sys

# Remember where the kernel was started and make that directory importable.
launch_dir = os.getcwd()
sys.path.append(launch_dir)

# Move two levels up; the relative paths used later in the notebook
# (e.g. "data/...") are resolved from there.
# NOTE: this is a relative chdir, so re-running the cell climbs two more levels.
os.chdir("../..")

import pandas as pd

# Raise the pandas display limits so wide/long frames are shown in full.
for option_name, option_value in (
    ("display.max_columns", 200),
    ("display.max_rows", 300),
):
    pd.set_option(option_name, option_value)
In [2]:
Copied!
import dill
import numpy as np
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
from inference_model.preprocessing.scaler import scaler_mapper
from inference_model.training.utils import (
predict_cls_lgbm_from_raw,
predict_proba_lgbm_from_raw,
)
from inference_model.preprocessing.label_encoder import LabelEncoder
import ast
# import tracemalloc
# import warnings
# from typing import Optional, Dict
# tracemalloc.start()
import dill
import numpy as np
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
from inference_model.preprocessing.scaler import scaler_mapper
from inference_model.training.utils import (
predict_cls_lgbm_from_raw,
predict_proba_lgbm_from_raw,
)
from inference_model.preprocessing.label_encoder import LabelEncoder
import ast
# import tracemalloc
# import warnings
# from typing import Optional, Dict
# tracemalloc.start()
Dataset¶
In [3]:
Copied!
# Split configuration.
# valid_size: fraction of all rows held out of the training split.
# test_size:  fraction of that hold-out that becomes the test split.
valid_size, test_size = 0.2, 0.5
# Seed shared by the splits and by H2O AutoML.
random_state = 1
# Not referenced by the cells shown in this notebook.
test_n_valid_combined = True
In [4]:
Copied!
# 1. get/create some example data
# Rows whose index label is <= this value are turned into the synthetic
# positive class (label-based .loc slicing is inclusive).
LAST_POSITIVE_LABEL = 100

with open("data/log_tiguan_27_mar_dac.txt") as f:
    # literal_eval only accepts Python literals, so the log cannot run code.
    data = ast.literal_eval(f.read())

# Each entry maps its first key (used as the column name) to a list of records
# carrying "ts_millis:" and "value" fields. Collect one Series per entry and
# concatenate once instead of growing a DataFrame inside the loop.
signal_columns = []
for data_value in data:
    signal_name = list(data_value)[0]
    signal_columns.append(
        pd.DataFrame(data_value[signal_name])
        .sort_values(by="ts_millis:", ascending=True)["value"]
        .rename(signal_name)
    )

# NOTE(review): concat(axis=1) aligns the Series on their original row labels,
# not on timestamp rank, so the sort above only influences row order - confirm
# that pairing samples by original position is intended.
df_pd = (
    pd.concat(signal_columns, axis=1) if signal_columns else pd.DataFrame()
).dropna()

# Synthetic labelling: everything is class 0 except the first block of rows,
# which is flagged as class 1 and given a saturated engine_load.
df_pd["class"] = 0
df_pd["car_id"] = "123abc"
df_pd.loc[:LAST_POSITIVE_LABEL, "class"] = 1
df_pd.loc[:LAST_POSITIVE_LABEL, "engine_load"] = 100
df_pd.head()
Out[4]:
| engine_load | engine_coolant_temp | engine_speed | vehicle_speed | intake_air_temp | maf | throttle_position | fuel_rg_pressure | barometric_pressure | control_voltage | class | car_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100.0 | 17.0 | 904.5 | 0.0 | 10.0 | 12.55 | 83.14 | 37270.0 | 101.0 | 0.06 | 1 | 123abc |
| 1 | 100.0 | 17.0 | 906.0 | 0.0 | 11.0 | 12.36 | 83.14 | 37800.0 | 101.0 | 14.56 | 1 | 123abc |
| 2 | 100.0 | 17.0 | 905.0 | 0.0 | 10.0 | 12.36 | 83.53 | 37800.0 | 101.0 | 14.68 | 1 | 123abc |
| 3 | 100.0 | 18.0 | 905.5 | 0.0 | 11.0 | 12.30 | 83.53 | 37800.0 | 101.0 | 14.72 | 1 | 123abc |
| 4 | 100.0 | 18.0 | 907.0 | 0.0 | 11.0 | 12.47 | 83.14 | 36740.0 | 101.0 | 14.72 | 1 | 123abc |
In [8]:
Copied!
# Column roles used by every model below.
target_col = "class"
id_cols = ["car_id"]
cat_cols = []

# Whatever is not an identifier, a categorical feature or the target is
# treated as a continuous feature.
non_continuous_cols = id_cols + cat_cols + [target_col]
cont_cols = df_pd.columns.drop(non_continuous_cols).tolist()

# Categorical features are kept as strings (nothing to cast while cat_cols is empty).
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
Check for possible class imbalance.
In [9]:
Copied!
# Number of rows per target label, to see how skewed the classes are.
class_counts = df_pd[target_col].value_counts()
class_counts
Out[9]:
class 0 934 1 101 Name: count, dtype: int64
Preprocessing¶
- divide dataset into train, test, valid
- scale continuous columns with a standard scaler (not needed for LightGBM, but required by some of the other models)
In [10]:
Copied!
# Step 1: carve a stratified hold-out (valid_size of all rows) out of the data.
df_train, df_holdout = train_test_split(
    df_pd,
    test_size=valid_size,
    stratify=df_pd[target_col],
    random_state=random_state,
)
# Step 2: split the hold-out into validation and test, again stratified;
# test_size is the share of the hold-out that goes to the test split.
df_valid, df_test = train_test_split(
    df_holdout,
    test_size=test_size,
    stratify=df_holdout[target_col],
    random_state=random_state,
)

# Give every split a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Only the continuous columns get a scaler; the other groups map to None.
scaler_mapper_def = dict(
    cont_cols=StandardScaler,
    cat_cols=None,
    id_cols=None,
)
# The target is passed together with the id columns, i.e. in the group that
# has no scaler assigned (presumably so it passes through unchanged - see
# scaler_mapper for the exact handling).
scaler = scaler_mapper(
    cont_cols=cont_cols,
    cat_cols=cat_cols,
    id_cols=[target_col] + id_cols,
    scaler_mapper_def=scaler_mapper_def,
)

# Fit on the training split only, then apply the fitted scaler to the others.
df_train_scaled = scaler.fit_transform(df_train)
df_test_scaled = scaler.transform(df_test)
df_valid_scaled = scaler.transform(df_valid)
Logistic Regression¶
In [11]:
Copied!
# Columns that must not be fed to the classifier.
lr_excluded_cols = [target_col] + id_cols + cat_cols

# class_weight="balanced" reweights the classes to offset the imbalance.
LR_clf = LogisticRegression(class_weight="balanced")
LR_clf.fit(
    df_train_scaled.drop(columns=lr_excluded_cols),
    df_train_scaled[target_col],
)

# Hard label predictions on the test split and the resulting report.
LR_predicted = LR_clf.predict(df_test_scaled.drop(columns=lr_excluded_cols))
lr_report = classification_report(df_test_scaled[target_col], LR_predicted)
print(f"LR classification report :\n{lr_report}")
LR classification report :
precision recall f1-score support
0 1.00 1.00 1.00 94
1 1.00 1.00 1.00 10
accuracy 1.00 104
macro avg 1.00 1.00 1.00 104
weighted avg 1.00 1.00 1.00 104
In [12]:
Copied!
# Which metric matters here? Is it more important to predict class 0 or
# class 1 precisely? The weighted F1 score is used as the selection criterion.
from sklearn.metrics import f1_score

criterion = f1_score

# The decision threshold is a tuned hyper-parameter, so it is selected on the
# validation split; the test split stays untouched for the final evaluation.
valid_features = df_valid_scaled.drop(columns=[target_col] + id_cols + cat_cols)
valid_labels = df_valid_scaled[target_col]

# The positive-class probabilities do not depend on the threshold:
# compute them once instead of once per candidate threshold.
positive_proba = LR_clf.predict_proba(valid_features)[:, 1]

threshold_score = [
    (
        t,
        criterion(valid_labels, (positive_proba >= t).astype(int), average="weighted"),
    )
    for t in np.arange(0.2, 0.8, 0.01)
]

# sorted() is stable, so among equally scored thresholds the lowest one wins.
threshold_score = sorted(threshold_score, key=lambda x: x[1], reverse=True)
best_threshold, best_score = threshold_score[0]
print(f"The best threshold\n{best_threshold}\n, with score:\n{best_score}")
The best threshold 0.2 , with score: 1.0
In [13]:
Copied!
# One-hot encoded ground truth and per-class probabilities for the test split.
lr_test_features = df_test_scaled.drop(columns=[target_col] + id_cols + cat_cols)
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values
y_score = LR_clf.predict_proba(lr_test_features)

# Same inputs, two averaging modes: None -> one AUC per class,
# "weighted" -> a single support-weighted AUC.
micro_roc_auc_per_class, micro_roc_auc_weighted = (
    roc_auc_score(y_onehot_test, y_score, average=averaging)
    for averaging in (None, "weighted")
)
In [14]:
Copied!
# Report both AUC flavours computed in the previous cell.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)
Per class AUC: [1. 1.] Weighted AUC: 1.0
In [16]:
Copied!
# Start (or connect to) a local H2O cluster.
h2o.init(log_dir="h2o_logs", log_level="WARN")

# Upload the three scaled splits, without the identifier columns.
h2o_train, h2o_valid, h2o_test = (
    h2o.H2OFrame(split.drop(columns=id_cols))
    for split in (df_train_scaled, df_valid_scaled, df_test_scaled)
)

# For binary classification the response has to be a factor.
for h2o_split in (h2o_train, h2o_valid, h2o_test):
    h2o_split[target_col] = h2o_split[target_col].asfactor()

# AutoML task, limited to three minutes of search.
aml = H2OAutoML(seed=random_state, max_runtime_secs=180)
# Over/under-sample the classes for this classification task.
aml.balance_classes = True

# Train on the training frame; the validation frame ranks the leaderboard.
feature_names = list(cont_cols + cat_cols)
_ = aml.train(
    x=feature_names,
    y=target_col,
    training_frame=h2o_train,
    leaderboard_frame=h2o_valid,
)
m = aml.get_best_model()

# Show the leaderboard and the leader's performance on the validation frame.
lb = h2o.automl.get_leaderboard(aml, extra_columns="ALL")
print(lb)
print(m.model_performance(h2o_valid))

# Score the test frame with the leader model.
predictions = m.predict(h2o_test)

# Optional: persist the results and the model.
# h2o.export_file(lb, path="h2o_logs/leaderboard.csv", force=True)
# MOJO is h2o version agnostic
# m.save_mojo("h2o_logs/bestmodel.zip")
Checking whether there is an H2O instance running at http://localhost:54321..... not found. Attempting to start a local H2O server... Java Version: openjdk version "17.0.10" 2024-01-16; OpenJDK Runtime Environment (build 17.0.10+7-Ubuntu-122.04.1); OpenJDK 64-Bit Server VM (build 17.0.10+7-Ubuntu-122.04.1, mixed mode, sharing) Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar Ice root: /tmp/tmp0jm6weez JVM stdout: /tmp/tmp0jm6weez/h2o_unknownUser_started_from_python.out JVM stderr: /tmp/tmp0jm6weez/h2o_unknownUser_started_from_python.err Server is running at http://127.0.0.1:54321 Connecting to H2O server at http://127.0.0.1:54321 ... successful.
| H2O_cluster_uptime: | 01 secs |
| H2O_cluster_timezone: | Etc/UTC |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.46.0.2 |
| H2O_cluster_version_age: | 8 days |
| H2O_cluster_name: | H2O_from_python_unknownUser_maoqtf |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 16 Gb |
| H2O_cluster_total_cores: | 12 |
| H2O_cluster_allowed_cores: | 12 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://127.0.0.1:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.10.11 final |
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
13:16:41.801: _train param, Dropping bad and constant columns: [barometric_pressure]
█
13:16:44.939: _train param, Dropping bad and constant columns: [barometric_pressure]
█
13:16:46.154: _train param, Dropping bad and constant columns: [barometric_pressure]
█████
13:16:59.224: _train param, Dropping unused columns: [barometric_pressure]
13:16:59.913: _train param, Dropping bad and constant columns: [barometric_pressure]
█
13:17:03.240: _train param, Dropping bad and constant columns: [barometric_pressure]
█
13:17:05.128: _train param, Dropping bad and constant columns: [barometric_pressure]
███
13:17:16.407: _train param, Dropping bad and constant columns: [barometric_pressure]
████
13:17:27.855: _train param, Dropping bad and constant columns: [barometric_pressure]
█████
13:17:40.196: _train param, Dropping unused columns: [barometric_pressure]
13:17:40.764: _train param, Dropping unused columns: [barometric_pressure]
13:17:41.338: _train param, Dropping bad and constant columns: [barometric_pressure]
█
13:17:44.575: _train param, Dropping bad and constant columns: [barometric_pressure]
13:17:45.413: _train param, Dropping bad and constant columns: [barometric_pressure]
███
13:17:53.937: _train param, Dropping bad and constant columns: [barometric_pressure]
█
13:17:55.712: _train param, Dropping unused columns: [barometric_pressure]
13:17:56.367: _train param, Dropping unused columns: [barometric_pressure]
██████████████████████████████████
13:19:35.448: _train param, Dropping unused columns: [barometric_pressure]
█
13:19:36.22: _train param, Dropping unused columns: [barometric_pressure]
██| (done) 100%
model_id auc logloss aucpr mean_per_class_error rmse mse training_time_ms predict_time_per_row_ms algo
DeepLearning_grid_1_AutoML_1_20240522_131641_model_8 1 0.0177159 1 0 0.0582198 0.00338955 628 0.127786 DeepLearning
DeepLearning_grid_1_AutoML_1_20240522_131641_model_4 1 0.0155053 1 0 0.0481989 0.00232314 117 0.1101 DeepLearning
XGBoost_1_AutoML_1_20240522_131641 1 0.0347672 1 0 0.0789026 0.00622562 464 0.078661 XGBoost
XGBoost_grid_1_AutoML_1_20240522_131641_model_12 1 0.0651884 1 0 0.106454 0.0113325 328 0.072441 XGBoost
DeepLearning_grid_1_AutoML_1_20240522_131641_model_1 1 0.0235103 1 0 0.0600679 0.00360815 99 0.098096 DeepLearning
XGBoost_grid_1_AutoML_1_20240522_131641_model_11 1 0.00556141 1 0 0.0320985 0.00103031 996 0.105708 XGBoost
GBM_4_AutoML_1_20240522_131641 1 9.26741e-06 1 0 9.39779e-05 8.83184e-09 4293 0.280377 GBM
XGBoost_grid_1_AutoML_1_20240522_131641_model_6 1 0.0141646 1 0 0.0244465 0.000597631 630 0.057813 XGBoost
XGBoost_grid_1_AutoML_1_20240522_131641_model_13 1 0.0318374 1 0 0.0810283 0.00656559 745 0.069021 XGBoost
GBM_3_AutoML_1_20240522_131641 1 2.55592e-05 1 0 0.000259039 6.7101e-08 4037 0.153118 GBM
[47 rows x 10 columns]
ModelMetricsBinomial: deeplearning
** Reported on test data. **
MSE: 0.0033895466349161047
RMSE: 0.05821981307867713
LogLoss: 0.017715874610611436
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9979014979275557
0 1 Error Rate
----- --- --- ------- -----------
0 93 0 0 (0.0/93.0)
1 0 10 0 (0.0/10.0)
Total 93 10 0 (0.0/103.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
--------------------------- ----------- ------- -----
max f1 0.997901 1 9
max f2 0.997901 1 9
max f0point5 0.997901 1 9
max accuracy 0.997901 1 9
max precision 0.999961 1 0
max recall 0.997901 1 9
max specificity 0.999961 1 0
max absolute_mcc 0.997901 1 9
max min_per_class_accuracy 0.997901 1 9
max mean_per_class_accuracy 0.997901 1 9
max tns 0.999961 93 0
max fns 0.999961 9 0
max fps 3.49717e-09 93 102
max tps 0.997901 10 9
max tnr 0.999961 1 0
max fnr 0.999961 0.9 0
max fpr 3.49717e-09 1 102
max tpr 0.997901 1 9
Gains/Lift Table: Avg response rate: 9.71 %, avg score: 11.26 %
group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
------- -------------------------- ----------------- ------ ----------------- --------------- ----------- -------------------------- ------------------ -------------- ------------------------- ------ ----------------- --------------------
1 0.0194175 0.999958 10.3 10.3 1 0.999959 1 0.999959 0.2 0.2 930 930 0.2
2 0.0291262 0.999947 10.3 10.3 1 0.999947 1 0.999955 0.1 0.3 930 930 0.3
3 0.038835 0.999937 10.3 10.3 1 0.999938 1 0.999951 0.1 0.4 930 930 0.4
4 0.0485437 0.999927 10.3 10.3 1 0.99993 1 0.999947 0.1 0.5 930 930 0.5
5 0.0582524 0.999888 10.3 10.3 1 0.999889 1 0.999937 0.1 0.6 930 930 0.6
6 0.106796 0.372647 8.24 9.36364 0.8 0.879862 0.909091 0.945358 0.4 1 724 836.364 0.989247
7 0.15534 0.10825 0 6.4375 0 0.168191 0.625 0.702493 0 1 -100 543.75 0.935484
8 0.203883 0.00316095 0 4.90476 0 0.0682301 0.47619 0.551478 0 1 -100 390.476 0.88172
9 0.300971 0.000343083 0 3.32258 0 0.00100323 0.322581 0.373906 0 1 -100 232.258 0.774194
10 0.398058 0.000119786 0 2.5122 0 0.000202804 0.243902 0.282759 0 1 -100 151.22 0.666667
11 0.504854 8.52118e-05 0 1.98077 0 0.000102754 0.192308 0.222966 0 1 -100 98.0769 0.548387
12 0.601942 6.24023e-05 0 1.66129 0 7.12997e-05 0.16129 0.187015 0 1 -100 66.129 0.44086
13 0.699029 4.02512e-05 0 1.43056 0 5.12346e-05 0.138889 0.161048 0 1 -100 43.0556 0.333333
14 0.796117 2.31994e-05 0 1.2561 0 3.1514e-05 0.121951 0.141412 0 1 -100 25.6098 0.225806
15 0.893204 1.36306e-05 0 1.11957 0 1.91168e-05 0.108696 0.126043 0 1 -100 11.9565 0.11828
16 1 3.49717e-09 0 1 0 5.41845e-06 0.0970874 0.112583 0 1 -100 0 0
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
In [17]:
Copied!
# One-hot encoded ground truth for the test split.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values
# Everything except the hard label column of the H2O prediction frame is used
# as the score matrix.
h2o_score_frame = predictions.drop(["predict"])
y_score = h2o.as_list(h2o_score_frame).values

# Same inputs, two averaging modes: None -> one AUC per class,
# "weighted" -> a single support-weighted AUC.
micro_roc_auc_per_class, micro_roc_auc_weighted = (
    roc_auc_score(y_onehot_test, y_score, average=averaging)
    for averaging in (None, "weighted")
)
/opt/conda/lib/python3.10/site-packages/h2o/frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread. For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above) and activate it using:
with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
pandas_df = h2o_df.as_data_frame()
warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
In [18]:
Copied!
# Report both AUC flavours computed in the previous cell.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)
Per class AUC: [1. 1.] Weighted AUC: 1.0
In [19]:
Copied!
# Draw one ROC curve per class on a shared axis, from the one-hot ground truth
# (y_onehot_test) and the score matrix (y_score) computed above.
fig, ax = plt.subplots(figsize=(6, 6))

# Take the number of curves from the score matrix instead of hard-coding it,
# so the cell keeps working if the number of classes changes.
n_classes = y_score.shape[1]
for class_id in range(n_classes):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, class_id],
        name=f"ROC curve for class {class_id}",
        ax=ax,
    )

# A figure should be readable on its own when the notebook is skimmed.
ax.set_title("ROC curve per class (test split)")
plt.show()
In [20]:
Copied!
# Compare the hard labels predicted by the H2O leader with the test ground truth.
h2o_predicted_labels = h2o.as_list(predictions["predict"])
h2o_report = classification_report(df_test[target_col], h2o_predicted_labels)
print(f"Classification report:\n{h2o_report}")
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 94
1 1.00 1.00 1.00 10
accuracy 1.00 104
macro avg 1.00 1.00 1.00 104
weighted avg 1.00 1.00 1.00 104
/opt/conda/lib/python3.10/site-packages/h2o/frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread. For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above) and activate it using:
with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
pandas_df = h2o_df.as_data_frame()
warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
LightGBM¶
In [21]:
Copied!
# LightGBM parameters: binary classification objective.
config = {"objective": "binary"}

# Label-encode the categorical columns; the encoder is fitted on the training
# split and works on copies of the scaled frames.
lgb_cat_cols = cat_cols
label_encoder = LabelEncoder(lgb_cat_cols)
df_train_scaled_enc = label_encoder.fit_transform(df_train_scaled.copy())
df_valid_scaled_enc = label_encoder.transform(df_valid_scaled.copy())
df_test_scaled_enc = label_encoder.transform(df_test_scaled.copy())

# Columns that are not model features.
lgb_non_feature_cols = [target_col] + id_cols

# TRAIN / VALID datasets; free_raw_data=False keeps the raw frame on the Dataset.
lgbtrain = lgbm.Dataset(
    data=df_train_scaled_enc.drop(columns=lgb_non_feature_cols),
    label=df_train_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbvalid = lgbm.Dataset(
    data=df_valid_scaled_enc.drop(columns=lgb_non_feature_cols),
    label=df_valid_scaled_enc[target_col],
    reference=lgbtrain,
    free_raw_data=False,
)

# Final TRAIN/TEST: train + valid stacked together, and the test split
# referencing that combined dataset.
ftrain = pd.concat(
    [df_train_scaled_enc, df_valid_scaled_enc]
).reset_index(drop=True)
flgbtrain = lgbm.Dataset(
    data=ftrain.drop(columns=lgb_non_feature_cols),
    label=ftrain[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbtest = lgbm.Dataset(
    data=df_test_scaled_enc.drop(columns=lgb_non_feature_cols),
    label=df_test_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)
In [22]:
Copied!
# Train on the training dataset and track the metric on the validation dataset
# (registered under an empty name). No custom evaluation function is used.
model = lgbm.train(
    params=config,
    train_set=lgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    feval=None,
)
[LightGBM] [Info] Number of positive: 81, number of negative: 747 [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002491 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 467 [LightGBM] [Info] Number of data points in the train set: 828, number of used features: 9 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097826 -> initscore=-2.221616 [LightGBM] [Info] Start training from score -2.221616 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No 
further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with 
positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf
In [23]:
Copied!
# Raw (pre-link) scores for the test features, converted to class labels by
# the project helper.
lgbm_raw_scores = model.predict(lgbtest.data, raw_score=True)
res = predict_cls_lgbm_from_raw(preds_raw=lgbm_raw_scores, task="binary")

# Predictions and ground truth side by side.
result = pd.DataFrame(
    {"predicted": res, "ground_truth": df_test[target_col].values}
)

lgbm_report = classification_report(result["ground_truth"], result["predicted"])
print(f"Classification report:\n{lgbm_report}")
Classification report:
precision recall f1-score support
0 1.00 1.00 1.00 94
1 1.00 1.00 1.00 10
accuracy 1.00 104
macro avg 1.00 1.00 1.00 104
weighted avg 1.00 1.00 1.00 104
In [24]:
Copied!
# One-hot encoded ground truth for the test split.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values

# The helper takes *raw* scores (`preds_raw`), so request raw scores from the
# booster, exactly as the classification-report cell does. Passing the default
# predict() output would hand already-transformed probabilities to a helper
# that presumably applies the link function itself (confirm in
# inference_model.training.utils). AUC is rank-based, so it is not affected
# either way, but the values stored in y_score are.
lgbm_raw_scores = model.predict(lgbtest.data, raw_score=True)
y_score = predict_proba_lgbm_from_raw(
    preds_raw=lgbm_raw_scores,
    task="binary",
    binary2d=True,
)

# average=None -> one AUC per class; "weighted" -> support-weighted AUC.
micro_roc_auc_per_class = roc_auc_score(
    y_onehot_test,
    y_score,
    average=None,
)
micro_roc_auc_weighted = roc_auc_score(
    y_onehot_test,
    y_score,
    average="weighted",
)
print(f"Per class AUC:\n{micro_roc_auc_per_class}")
print(f"Weighted AUC:\n{micro_roc_auc_weighted}")
Per class AUC: [1. 1.] Weighted AUC: 1.0