Models¶
Imports¶
In [1]:
Copied!
import os
import sys

import pandas as pd

# Remember the directory the kernel started in. Without this anchor the
# relative chdir below would climb two more levels every time the cell is
# re-run, silently breaking the relative "data/..." paths used later.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Keep the notebook's own directory importable (added only once).
if NOTEBOOK_START_DIR not in sys.path:
    sys.path.append(NOTEBOOK_START_DIR)

# Work from two levels above the notebook directory (presumably the project
# root - confirm against the repository layout).
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir, os.pardir))

# Show wide/long frames without truncation.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
In [2]:
Copied!
import dill
import numpy as np
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
from inference_model.preprocessing.scaler import scaler_mapper
from inference_model.training.utils import (
predict_cls_lgbm_from_raw,
predict_proba_lgbm_from_raw,
)
from inference_model.preprocessing.label_encoder import LabelEncoder
import ast
# import tracemalloc
# import warnings
# from typing import Optional, Dict
# tracemalloc.start()
import dill
import numpy as np
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
from inference_model.preprocessing.scaler import scaler_mapper
from inference_model.training.utils import (
predict_cls_lgbm_from_raw,
predict_proba_lgbm_from_raw,
)
from inference_model.preprocessing.label_encoder import LabelEncoder
import ast
# import tracemalloc
# import warnings
# from typing import Optional, Dict
# tracemalloc.start()
Dataset¶
In [3]:
Copied!
# Split configuration.
# random_state seeds both train/test splits and the AutoML run.
random_state = 1
# valid_size: fraction of all rows held out from training.
# test_size: fraction of that hold-out that becomes the test split
# (the remainder is the validation split).
valid_size, test_size = 0.2, 0.5
# NOTE(review): not referenced by any cell visible here - confirm it is needed.
test_n_valid_combined = True
In [4]:
Copied!
# 1. get/create some example data
# The log file is parsed as a Python literal. Each item is a dict whose first
# key names a signal and maps to that signal's records, which carry
# "ts_millis:" and "value" fields.
with open("data/log_tiguan_27_mar_dac.txt") as f:
    data = ast.literal_eval(f.read())

signal_columns = []
for signal in data:
    signal_name = next(iter(signal))
    signal_columns.append(
        pd.DataFrame(signal[signal_name])
        .sort_values(by="ts_millis:", ascending=True)["value"]
        # concat(axis=1) aligns on index labels, so the pre-sort labels must
        # be discarded; otherwise the timestamp ordering would not decide
        # which samples end up in the same row.
        .reset_index(drop=True)
        .rename(signal_name)
    )

# Build the frame with a single concat instead of growing it inside the loop,
# then keep only rows where every signal has a sample.
df_pd = pd.concat(signal_columns, axis=1).dropna()

# Synthetic labels for the example: everything is class 0 except the leading
# rows. .loc slicing is label-based and inclusive, so labels up to and
# including 100 are marked positive and given a saturated engine load.
df_pd["class"] = 0
df_pd["car_id"] = "123abc"
df_pd.loc[:100, ["class"]] = 1
df_pd.loc[:100, ["engine_load"]] = 100
df_pd.head()
Out[4]:
engine_load | engine_coolant_temp | engine_speed | vehicle_speed | intake_air_temp | maf | throttle_position | fuel_rg_pressure | barometric_pressure | control_voltage | class | car_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100.0 | 17.0 | 904.5 | 0.0 | 10.0 | 12.55 | 83.14 | 37270.0 | 101.0 | 0.06 | 1 | 123abc |
1 | 100.0 | 17.0 | 906.0 | 0.0 | 11.0 | 12.36 | 83.14 | 37800.0 | 101.0 | 14.56 | 1 | 123abc |
2 | 100.0 | 17.0 | 905.0 | 0.0 | 10.0 | 12.36 | 83.53 | 37800.0 | 101.0 | 14.68 | 1 | 123abc |
3 | 100.0 | 18.0 | 905.5 | 0.0 | 11.0 | 12.30 | 83.53 | 37800.0 | 101.0 | 14.72 | 1 | 123abc |
4 | 100.0 | 18.0 | 907.0 | 0.0 | 11.0 | 12.47 | 83.14 | 36740.0 | 101.0 | 14.72 | 1 | 123abc |
In [8]:
Copied!
# Column roles used by every model below.
target_col = "class"
id_cols = ["car_id"]
cat_cols = []

# Whatever is not an id, categorical or target column is treated as continuous.
non_continuous_cols = id_cols + cat_cols + [target_col]
cont_cols = df_pd.drop(columns=non_continuous_cols).columns.tolist()

# Categorical features are cast to strings; with cat_cols empty this selects
# no columns.
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
Check for possible class imbalance
In [9]:
Copied!
# Number of rows per target label; the last expression is rendered as the
# cell output.
class_counts = df_pd[target_col].value_counts()
class_counts
Out[9]:
class 0 934 1 101 Name: count, dtype: int64
Preprocessing¶
- divide dataset into train, test, valid
- scale continuous columns with a standard scaler (not needed for LightGBM, but required by the other models)
In [10]:
Copied!
# Stage 1: carve a stratified hold-out off the full data set.
df_train, df_holdout = train_test_split(
    df_pd,
    test_size=valid_size,
    stratify=df_pd[target_col],
    random_state=random_state,
)
# Stage 2: split the hold-out into validation and test parts, again stratified
# on the target.
df_valid, df_test = train_test_split(
    df_holdout,
    test_size=test_size,
    stratify=df_holdout[target_col],
    random_state=random_state,
)
# Give every split a fresh 0..n-1 index.
df_train, df_valid, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_valid, df_test)
)

# Only the continuous columns get a scaler class; the other groups map to None
# (presumably meaning "leave untouched" - confirm against scaler_mapper).
scaler_mapper_def = dict(cont_cols=StandardScaler, cat_cols=None, id_cols=None)
# The target is passed along with the id columns so it falls in the group
# that has no scaler assigned.
scaler = scaler_mapper(
    cont_cols=cont_cols,
    cat_cols=cat_cols,
    id_cols=[target_col] + id_cols,
    scaler_mapper_def=scaler_mapper_def,
)

# Fit on the training split only, then apply the fitted scaler to the others.
df_train_scaled = scaler.fit_transform(df_train)
df_test_scaled = scaler.transform(df_test)
df_valid_scaled = scaler.transform(df_valid)
Logistic Regression¶
In [11]:
Copied!
# Model inputs: everything except the target, id and categorical columns.
lr_excluded_cols = [target_col] + id_cols + cat_cols
lr_train_features = df_train_scaled.drop(columns=lr_excluded_cols)
lr_test_features = df_test_scaled.drop(columns=lr_excluded_cols)

# class_weight="balanced" is used because the positive class is the minority.
LR_clf = LogisticRegression(class_weight="balanced")
LR_clf.fit(lr_train_features, df_train_scaled[target_col])

LR_predicted = LR_clf.predict(lr_test_features)
lr_report = classification_report(df_test_scaled[target_col], LR_predicted)
print(f"LR classification report :\n{lr_report}")
LR classification report : precision recall f1-score support 0 1.00 1.00 1.00 94 1 1.00 1.00 1.00 10 accuracy 1.00 104 macro avg 1.00 1.00 1.00 104 weighted avg 1.00 1.00 1.00 104
In [12]:
Copied!
# Pick the decision threshold that maximises the weighted F1 score.
# The search runs on the validation split: choosing the threshold on the test
# split would leak test information into model selection and make the test
# metrics optimistic.
from sklearn.metrics import f1_score

criterion = f1_score

# Positive-class probabilities do not depend on the threshold, so score the
# validation rows once instead of once per candidate threshold.
lr_valid_features = df_valid_scaled.drop(columns=[target_col] + id_cols + cat_cols)
lr_valid_proba = LR_clf.predict_proba(lr_valid_features)[:, 1]
lr_valid_truth = df_valid_scaled[target_col]

threshold_score = []
for t in np.arange(0.2, 0.8, 0.01):
    preds_bin = (lr_valid_proba >= t).astype(int)
    threshold_score.append(
        (t, criterion(lr_valid_truth, preds_bin, average="weighted"))
    )

# Best score first; the sort is stable, so among ties the lowest threshold wins.
threshold_score = sorted(threshold_score, key=lambda x: x[1], reverse=True)
best_threshold, best_score = threshold_score[0]
print(f"The best threshold\n{best_threshold}\n, with score:\n{best_score}")
The best threshold 0.2 , with score: 1.0
In [13]:
Copied!
# One indicator column per class, so AUC can be reported per class.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values

# Class probabilities from the logistic regression on the test split.
lr_auc_features = df_test_scaled.drop(columns=[target_col] + id_cols + cat_cols)
y_score = LR_clf.predict_proba(lr_auc_features)

# average=None yields one AUC per class; "weighted" averages them by support.
micro_roc_auc_per_class, micro_roc_auc_weighted = (
    roc_auc_score(y_onehot_test, y_score, average=averaging)
    for averaging in (None, "weighted")
)
In [14]:
Copied!
# Report the AUC values computed in the previous cell.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)
Per class AUC: [1. 1.] Weighted AUC: 1.0
In [16]:
Copied!
# initialize H2O
h2o.init(log_dir="h2o_logs", log_level="WARN")

# Upload every split as an H2OFrame; id columns carry no signal and are dropped.
h2o_train, h2o_valid, h2o_test = (
    h2o.H2OFrame(split.drop(columns=id_cols))
    for split in (df_train_scaled, df_valid_scaled, df_test_scaled)
)

# For binary classification, response should be a factor
for h2o_frame in (h2o_train, h2o_valid, h2o_test):
    h2o_frame[target_col] = h2o_frame[target_col].asfactor()

# Define the AutoML task. Class balancing (over/under sampling) is requested
# through the constructor argument, so the setting cannot be lost by assigning
# an attribute on an already-built object.
aml = H2OAutoML(
    seed=random_state,
    max_runtime_secs=180,
    balance_classes=True,
)

# Run it: models are fitted on the training frame and ranked on the
# validation frame.
_ = aml.train(
    x=list(cont_cols + cat_cols),
    y=target_col,
    training_frame=h2o_train,
    leaderboard_frame=h2o_valid,
)
m = aml.get_best_model()

# Leaderboard with all extra columns, then the best model's validation metrics.
lb = h2o.automl.get_leaderboard(aml, extra_columns="ALL")
print(lb)
print(m.model_performance(h2o_valid))

# Test-split predictions, reused by the evaluation cells that follow.
predictions = m.predict(h2o_test)

# save results and model
# h2o.export_file(lb, path="h2o_logs/leaderboard.csv", force=True)
# MOJO is h2o version agnostic
# m.save_mojo("h2o_logs/bestmodel.zip")
Checking whether there is an H2O instance running at http://localhost:54321..... not found. Attempting to start a local H2O server... Java Version: openjdk version "17.0.10" 2024-01-16; OpenJDK Runtime Environment (build 17.0.10+7-Ubuntu-122.04.1); OpenJDK 64-Bit Server VM (build 17.0.10+7-Ubuntu-122.04.1, mixed mode, sharing) Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar Ice root: /tmp/tmp0jm6weez JVM stdout: /tmp/tmp0jm6weez/h2o_unknownUser_started_from_python.out JVM stderr: /tmp/tmp0jm6weez/h2o_unknownUser_started_from_python.err Server is running at http://127.0.0.1:54321 Connecting to H2O server at http://127.0.0.1:54321 ... successful.
H2O_cluster_uptime: | 01 secs |
H2O_cluster_timezone: | Etc/UTC |
H2O_data_parsing_timezone: | UTC |
H2O_cluster_version: | 3.46.0.2 |
H2O_cluster_version_age: | 8 days |
H2O_cluster_name: | H2O_from_python_unknownUser_maoqtf |
H2O_cluster_total_nodes: | 1 |
H2O_cluster_free_memory: | 16 Gb |
H2O_cluster_total_cores: | 12 |
H2O_cluster_allowed_cores: | 12 |
H2O_cluster_status: | locked, healthy |
H2O_connection_url: | http://127.0.0.1:54321 |
H2O_connection_proxy: | {"http": null, "https": null} |
H2O_internal_security: | False |
Python_version: | 3.10.11 final |
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100% Parse progress: |████████████████████████████████████████████████████████████████| (done) 100% Parse progress: |████████████████████████████████████████████████████████████████| (done) 100% AutoML progress: | 13:16:41.801: _train param, Dropping bad and constant columns: [barometric_pressure] █ 13:16:44.939: _train param, Dropping bad and constant columns: [barometric_pressure] █ 13:16:46.154: _train param, Dropping bad and constant columns: [barometric_pressure] █████ 13:16:59.224: _train param, Dropping unused columns: [barometric_pressure] 13:16:59.913: _train param, Dropping bad and constant columns: [barometric_pressure] █ 13:17:03.240: _train param, Dropping bad and constant columns: [barometric_pressure] █ 13:17:05.128: _train param, Dropping bad and constant columns: [barometric_pressure] ███ 13:17:16.407: _train param, Dropping bad and constant columns: [barometric_pressure] ████ 13:17:27.855: _train param, Dropping bad and constant columns: [barometric_pressure] █████ 13:17:40.196: _train param, Dropping unused columns: [barometric_pressure] 13:17:40.764: _train param, Dropping unused columns: [barometric_pressure] 13:17:41.338: _train param, Dropping bad and constant columns: [barometric_pressure] █ 13:17:44.575: _train param, Dropping bad and constant columns: [barometric_pressure] 13:17:45.413: _train param, Dropping bad and constant columns: [barometric_pressure] ███ 13:17:53.937: _train param, Dropping bad and constant columns: [barometric_pressure] █ 13:17:55.712: _train param, Dropping unused columns: [barometric_pressure] 13:17:56.367: _train param, Dropping unused columns: [barometric_pressure] ██████████████████████████████████ 13:19:35.448: _train param, Dropping unused columns: [barometric_pressure] █ 13:19:36.22: _train param, Dropping unused columns: [barometric_pressure] ██| (done) 100% model_id auc logloss aucpr mean_per_class_error rmse mse 
training_time_ms predict_time_per_row_ms algo DeepLearning_grid_1_AutoML_1_20240522_131641_model_8 1 0.0177159 1 0 0.0582198 0.00338955 628 0.127786 DeepLearning DeepLearning_grid_1_AutoML_1_20240522_131641_model_4 1 0.0155053 1 0 0.0481989 0.00232314 117 0.1101 DeepLearning XGBoost_1_AutoML_1_20240522_131641 1 0.0347672 1 0 0.0789026 0.00622562 464 0.078661 XGBoost XGBoost_grid_1_AutoML_1_20240522_131641_model_12 1 0.0651884 1 0 0.106454 0.0113325 328 0.072441 XGBoost DeepLearning_grid_1_AutoML_1_20240522_131641_model_1 1 0.0235103 1 0 0.0600679 0.00360815 99 0.098096 DeepLearning XGBoost_grid_1_AutoML_1_20240522_131641_model_11 1 0.00556141 1 0 0.0320985 0.00103031 996 0.105708 XGBoost GBM_4_AutoML_1_20240522_131641 1 9.26741e-06 1 0 9.39779e-05 8.83184e-09 4293 0.280377 GBM XGBoost_grid_1_AutoML_1_20240522_131641_model_6 1 0.0141646 1 0 0.0244465 0.000597631 630 0.057813 XGBoost XGBoost_grid_1_AutoML_1_20240522_131641_model_13 1 0.0318374 1 0 0.0810283 0.00656559 745 0.069021 XGBoost GBM_3_AutoML_1_20240522_131641 1 2.55592e-05 1 0 0.000259039 6.7101e-08 4037 0.153118 GBM [47 rows x 10 columns] ModelMetricsBinomial: deeplearning ** Reported on test data. 
** MSE: 0.0033895466349161047 RMSE: 0.05821981307867713 LogLoss: 0.017715874610611436 Mean Per-Class Error: 0.0 AUC: 1.0 AUCPR: 1.0 Gini: 1.0 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9979014979275557 0 1 Error Rate ----- --- --- ------- ----------- 0 93 0 0 (0.0/93.0) 1 0 10 0 (0.0/10.0) Total 93 10 0 (0.0/103.0) Maximum Metrics: Maximum metrics at their respective thresholds metric threshold value idx --------------------------- ----------- ------- ----- max f1 0.997901 1 9 max f2 0.997901 1 9 max f0point5 0.997901 1 9 max accuracy 0.997901 1 9 max precision 0.999961 1 0 max recall 0.997901 1 9 max specificity 0.999961 1 0 max absolute_mcc 0.997901 1 9 max min_per_class_accuracy 0.997901 1 9 max mean_per_class_accuracy 0.997901 1 9 max tns 0.999961 93 0 max fns 0.999961 9 0 max fps 3.49717e-09 93 102 max tps 0.997901 10 9 max tnr 0.999961 1 0 max fnr 0.999961 0.9 0 max fpr 3.49717e-09 1 102 max tpr 0.997901 1 9 Gains/Lift Table: Avg response rate: 9.71 %, avg score: 11.26 % group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov ------- -------------------------- ----------------- ------ ----------------- --------------- ----------- -------------------------- ------------------ -------------- ------------------------- ------ ----------------- -------------------- 1 0.0194175 0.999958 10.3 10.3 1 0.999959 1 0.999959 0.2 0.2 930 930 0.2 2 0.0291262 0.999947 10.3 10.3 1 0.999947 1 0.999955 0.1 0.3 930 930 0.3 3 0.038835 0.999937 10.3 10.3 1 0.999938 1 0.999951 0.1 0.4 930 930 0.4 4 0.0485437 0.999927 10.3 10.3 1 0.99993 1 0.999947 0.1 0.5 930 930 0.5 5 0.0582524 0.999888 10.3 10.3 1 0.999889 1 0.999937 0.1 0.6 930 930 0.6 6 0.106796 0.372647 8.24 9.36364 0.8 0.879862 0.909091 0.945358 0.4 1 724 836.364 0.989247 7 0.15534 0.10825 0 6.4375 0 0.168191 0.625 0.702493 0 1 -100 543.75 0.935484 8 0.203883 
0.00316095 0 4.90476 0 0.0682301 0.47619 0.551478 0 1 -100 390.476 0.88172 9 0.300971 0.000343083 0 3.32258 0 0.00100323 0.322581 0.373906 0 1 -100 232.258 0.774194 10 0.398058 0.000119786 0 2.5122 0 0.000202804 0.243902 0.282759 0 1 -100 151.22 0.666667 11 0.504854 8.52118e-05 0 1.98077 0 0.000102754 0.192308 0.222966 0 1 -100 98.0769 0.548387 12 0.601942 6.24023e-05 0 1.66129 0 7.12997e-05 0.16129 0.187015 0 1 -100 66.129 0.44086 13 0.699029 4.02512e-05 0 1.43056 0 5.12346e-05 0.138889 0.161048 0 1 -100 43.0556 0.333333 14 0.796117 2.31994e-05 0 1.2561 0 3.1514e-05 0.121951 0.141412 0 1 -100 25.6098 0.225806 15 0.893204 1.36306e-05 0 1.11957 0 1.91168e-05 0.108696 0.126043 0 1 -100 11.9565 0.11828 16 1 3.49717e-09 0 1 0 5.41845e-06 0.0970874 0.112583 0 1 -100 0 0 deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
In [17]:
Copied!
# One indicator column per class, so AUC can be reported per class.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values

# Remove the hard-label column; the remaining prediction columns are used as
# the per-class scores.
h2o_score_frame = predictions.drop(["predict"])
y_score = h2o.as_list(h2o_score_frame).values

# average=None yields one AUC per class; "weighted" averages them by support.
micro_roc_auc_per_class, micro_roc_auc_weighted = (
    roc_auc_score(y_onehot_test, y_score, average=averaging)
    for averaging in (None, "weighted")
)
/opt/conda/lib/python3.10/site-packages/h2o/frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread. For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above) and activate it using: with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True): pandas_df = h2o_df.as_data_frame() warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
In [18]:
Copied!
# Report the AUC values computed in the previous cell.
print(
    f"Per class AUC:\n{micro_roc_auc_per_class}",
    f"Weighted AUC:\n{micro_roc_auc_weighted}",
    sep="\n",
)
Per class AUC: [1. 1.] Weighted AUC: 1.0
In [19]:
Copied!
# Draw one ROC curve per class on a shared axis, using the one-hot test labels
# and score matrix produced by the preceding AUC cell.
fig, ax = plt.subplots(figsize=(6, 6))

# Take the class count from the score matrix instead of hard-coding it.
n_classes = y_score.shape[1]
for class_id in range(n_classes):
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, class_id],
        name=f"ROC curve for class {class_id}",
        ax=ax,
    )

ax.set_title("Per-class ROC curves on the test split")
plt.show()
In [20]:
Copied!
# Compare the AutoML model's hard labels with the test-split ground truth.
h2o_predicted_labels = h2o.as_list(predictions["predict"])
h2o_report = classification_report(df_test[target_col], h2o_predicted_labels)
print(f"Classification report:\n{h2o_report}")
Classification report: precision recall f1-score support 0 1.00 1.00 1.00 94 1 1.00 1.00 1.00 10 accuracy 1.00 104 macro avg 1.00 1.00 1.00 104 weighted avg 1.00 1.00 1.00 104
/opt/conda/lib/python3.10/site-packages/h2o/frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread. For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above) and activate it using: with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True): pandas_df = h2o_df.as_data_frame() warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
LightGBM¶
In [21]:
Copied!
# LightGBM parameters: binary classification objective.
config = {"objective": "binary"}

# Encode categorical columns on copies of the scaled splits; the encoder is
# fitted on the training split and re-applied to the others.
lgb_cat_cols = cat_cols
label_encoder = LabelEncoder(lgb_cat_cols)
df_train_scaled_enc = label_encoder.fit_transform(df_train_scaled.copy())
df_valid_scaled_enc = label_encoder.transform(df_valid_scaled.copy())
df_test_scaled_enc = label_encoder.transform(df_test_scaled.copy())

# Columns that must not reach the model as features.
lgb_excluded_cols = [target_col] + id_cols

# free_raw_data=False keeps the source frame on each Dataset so it can be
# read back through `.data` at prediction time.
lgbtrain = lgbm.Dataset(
    data=df_train_scaled_enc.drop(columns=lgb_excluded_cols),
    label=df_train_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbvalid = lgbm.Dataset(
    data=df_valid_scaled_enc.drop(columns=lgb_excluded_cols),
    label=df_valid_scaled_enc[target_col],
    reference=lgbtrain,
    free_raw_data=False,
)

# Final TRAIN/TEST
# Train and validation rows stacked together, with a fresh 0..n-1 index.
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc], ignore_index=True)
flgbtrain = lgbm.Dataset(
    data=ftrain.drop(columns=lgb_excluded_cols),
    label=ftrain[target_col],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbtest = lgbm.Dataset(
    data=df_test_scaled_enc.drop(columns=lgb_excluded_cols),
    label=df_test_scaled_enc[target_col],
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)
In [22]:
Copied!
# Fit the booster on the training Dataset while reporting on the validation
# Dataset. The validation set is registered under an empty name and no custom
# evaluation function is supplied.
lgbm_train_kwargs = dict(
    params=config,
    train_set=lgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    feval=None,
)
model = lgbm.train(**lgbm_train_kwargs)
[LightGBM] [Info] Number of positive: 81, number of negative: 747 [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002491 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 467 [LightGBM] [Info] Number of data points in the train set: 828, number of used features: 9 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097826 -> initscore=-2.221616 [LightGBM] [Info] Start training from score -2.221616 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No 
further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with 
positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf
In [23]:
Copied!
# Raw (pre-link) scores for the test rows, turned into class labels by the
# project helper.
lgbm_raw_scores = model.predict(lgbtest.data, raw_score=True)
res = predict_cls_lgbm_from_raw(preds_raw=lgbm_raw_scores, task="binary")

# Predictions side by side with the ground truth of the test split.
result = pd.DataFrame(
    {"predicted": res, "ground_truth": df_test[target_col].values}
)

lgbm_report = classification_report(result["ground_truth"], result["predicted"])
print(f"Classification report:\n{lgbm_report}")
Classification report: precision recall f1-score support 0 1.00 1.00 1.00 94 1 1.00 1.00 1.00 10 accuracy 1.00 104 macro avg 1.00 1.00 1.00 104 weighted avg 1.00 1.00 1.00 104
In [24]:
Copied!
# One indicator column per class, so AUC can be reported per class.
y_onehot_test = pd.get_dummies(df_test_scaled[target_col]).values

# The helper takes `preds_raw`, so hand it raw scores (raw_score=True), the
# same way the class-prediction cell feeds predict_cls_lgbm_from_raw.
# Without the flag, model.predict returns probabilities and the helper would
# presumably apply the link function a second time - confirm against
# inference_model.training.utils.
lgbm_raw_scores = model.predict(lgbtest.data, raw_score=True)
y_score = predict_proba_lgbm_from_raw(
    preds_raw=lgbm_raw_scores,
    task="binary",
    binary2d=True,
)

# average=None yields one AUC per class; "weighted" averages them by support.
micro_roc_auc_per_class = roc_auc_score(
    y_onehot_test,
    y_score,
    average=None,
)
micro_roc_auc_weighted = roc_auc_score(
    y_onehot_test,
    y_score,
    average="weighted",
)
print(f"Per class AUC:\n{micro_roc_auc_per_class}")
print(f"Weighted AUC:\n{micro_roc_auc_weighted}")
Per class AUC: [1. 1.] Weighted AUC: 1.0