Feature analysis¶
Purpose:
- analyse the dataset features without including the target value
- check whether the dataset looks healthy:
    - duplicate values
    - NA values
    - zero values
- check whether values are "too constant", i.e. most probably bad features for classification/regression:
    - very small std values
    - entropy that is too low (near-constant column) or too high (almost every row unique); entropy is a measure of disorder or uncertainty
- check whether the number of unique values in categorical features is too high - such features are most probably unable to identify any group of targets
- drop highly correlated features (a plain-pandas sketch of this step follows the list):
    - to improve the performance of model training
    - to decrease the dataset size for further exploration
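The correlation check later in this notebook only plots and lists the strongest pairs; the dropping itself is left to the user. A minimal plain-pandas sketch of how it could be automated is shown below. The drop_highly_correlated helper and the 0.95 threshold are illustrative assumptions, not part of inference_model.eda.

import numpy as np
import pandas as pd


def drop_highly_correlated(df: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame:
    # Absolute pairwise Pearson correlations between the numeric columns.
    corr = df.corr(numeric_only=True).abs()
    # Keep only the upper triangle so every pair is inspected exactly once.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    # Drop each column that is highly correlated with an earlier column.
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)

At a 0.95 threshold nothing in this particular dataset would be dropped (the strongest pair reported below is about 0.65), which is exactly the kind of conclusion this check is meant to support.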
Imports¶
In [1]:
import ast
import os
import sys

# Make the notebook directory importable and move to the repository root,
# so that `inference_model` and the relative data paths resolve correctly.
sys.path.append(os.getcwd())
os.chdir("../..")

import pandas as pd

# import plotly.graph_objects as go
from inference_model.eda.features.analysis import (
    entropy,
    init_check,
    missing,
    nunique,
    std,
    zero,
)
from inference_model.eda.features.plotting import cross_correlation, distributions

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)
Dataset¶
In [2]:
# 1. get/create some example data
with open("data/log_tiguan_27_mar_dac.txt") as f:
    data = ast.literal_eval(f.read())

df_pd = pd.DataFrame()
for data_value in data:
    # Each entry is a single-key dict: {signal_name: [{"ts_millis:": ..., "value": ...}, ...]}.
    # Sort the readings by timestamp and keep the values as one column per signal.
    temp_df = pd.DataFrame(data_value[list(data_value)[0]]).sort_values(
        by="ts_millis:", ascending=True
    )["value"]
    temp_df.rename(list(data_value)[0], inplace=True)
    df_pd = pd.concat([df_pd, temp_df], axis=1)

df_pd.dropna(inplace=True)

# Add a synthetic target, a synthetic identifier and an artificial anomaly in engine_load.
df_pd["class"] = 0
df_pd["car_id"] = "123abc"
df_pd.loc[:100, ["class"]] = 1
df_pd.loc[:100, ["engine_load"]] = 100
df_pd.head()
Out[2]:
|   | engine_load | engine_coolant_temp | engine_speed | vehicle_speed | intake_air_temp | maf | throttle_position | fuel_rg_pressure | barometric_pressure | control_voltage | class | car_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100.0 | 17.0 | 904.5 | 0.0 | 10.0 | 12.55 | 83.14 | 37270.0 | 101.0 | 0.06 | 1 | 123abc |
| 1 | 100.0 | 17.0 | 906.0 | 0.0 | 11.0 | 12.36 | 83.14 | 37800.0 | 101.0 | 14.56 | 1 | 123abc |
| 2 | 100.0 | 17.0 | 905.0 | 0.0 | 10.0 | 12.36 | 83.53 | 37800.0 | 101.0 | 14.68 | 1 | 123abc |
| 3 | 100.0 | 18.0 | 905.5 | 0.0 | 11.0 | 12.30 | 83.53 | 37800.0 | 101.0 | 14.72 | 1 | 123abc |
| 4 | 100.0 | 18.0 | 907.0 | 0.0 | 11.0 | 12.47 | 83.14 | 36740.0 | 101.0 | 14.72 | 1 | 123abc |
In [3]:
target_col = "class"
id_cols = ["car_id"]
cat_cols = []

# Continuous columns are everything except the id, categorical and target columns.
cont_cols = df_pd.drop(
    columns=id_cols + cat_cols + [target_col]
).columns.values.tolist()

# Cast categorical columns to string (a no-op here, since cat_cols is empty),
# then pass None downstream to signal that there are no categorical features.
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
cat_cols = None
Features¶
Analysis¶
In [4]:
duplicated_ids, cont_cols_desc, cat_cols_desc = init_check(
    df=df_pd,
    identifier=id_cols,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
    verbose=True,
)
# duplicated_ids, cont_cols_desc.head(), cat_cols_desc.head()
[CHECK] Number of duplicated ids: 1034

[CHECK] Numerical columns

| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| maf | 1035.0 | 11.00 | 3.40 | 0.03 | 9.77 | 10.19 | 11.99 | 31.83 |
| vehicle_speed | 1035.0 | 0.95 | 3.70 | 0.00 | 0.00 | 0.00 | 0.00 | 20.00 |
| engine_load | 1035.0 | 46.29 | 21.59 | 30.59 | 35.69 | 37.25 | 38.82 | 100.00 |
| barometric_pressure | 1035.0 | 101.00 | 0.00 | 101.00 | 101.00 | 101.00 | 101.00 | 101.00 |
| intake_air_temp | 1035.0 | 28.56 | 18.16 | -6.00 | 20.00 | 26.00 | 32.00 | 145.00 |
| throttle_position | 1035.0 | 83.21 | 0.15 | 83.14 | 83.14 | 83.14 | 83.14 | 83.53 |
| engine_coolant_temp | 1035.0 | 76.66 | 21.89 | 17.00 | 74.50 | 88.00 | 89.00 | 91.00 |
| engine_speed | 1035.0 | 929.39 | 270.78 | 3.20 | 893.00 | 896.50 | 899.50 | 2282.50 |
| fuel_rg_pressure | 1035.0 | 35967.15 | 14377.15 | 110.00 | 31220.00 | 34050.00 | 35390.00 | 108770.00 |
| control_voltage | 1035.0 | 14.32 | 2.14 | 0.05 | 14.76 | 14.78 | 14.80 | 14.84 |
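The 1034 duplicated ids are expected here: all 1035 rows share the single synthetic car_id "123abc", so every row after the first repeats an identifier that was already seen. Assuming init_check counts duplicates over the identifier columns (which matches the number above), the same figure can be reproduced with plain pandas:

# Every row after the first has an identifier already seen, hence 1034.
print(df_pd.duplicated(subset=id_cols).sum())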
In [5]:
missing_val_frac, fig = missing(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
missing_val_frac.head()
Out[5]:
engine_load            0.0
engine_coolant_temp    0.0
engine_speed           0.0
vehicle_speed          0.0
intake_air_temp        0.0
dtype: float64
In [6]:
zero_val_frac, fig = zero(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
zero_val_frac.head()
Out[6]:
vehicle_speed          0.927536
class                  0.902415
engine_load            0.000000
engine_coolant_temp    0.000000
engine_speed           0.000000
dtype: float64
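There are no missing values, but vehicle_speed is zero in about 93% of the rows (the car is mostly standing still) and class is zero by construction. If columns dominated by zeros or NAs had to be dropped automatically, a minimal plain-pandas sketch could look like this; the 0.95 cut-off is an illustrative assumption, not something prescribed by inference_model.eda:

max_bad_frac = 0.95  # assumed cut-off, tune per use case

zero_frac = (df_pd[cont_cols] == 0).mean()
missing_frac = df_pd[cont_cols].isna().mean()

# Drop continuous features that are almost entirely zero or missing.
flagged = (zero_frac > max_bad_frac) | (missing_frac > max_bad_frac)
cols_to_drop = flagged[flagged].index.tolist()
df_reduced = df_pd.drop(columns=cols_to_drop)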
In [7]:
data_nunique, fig = nunique(
    df=df_pd.drop(columns=id_cols),
    scale="log",
    plot=True,
)
data_nunique.head()
Out[7]:
barometric_pressure     1
throttle_position       2
class                   2
vehicle_speed          21
control_voltage        48
dtype: int64
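barometric_pressure has a single unique value and therefore cannot help any model. A short plain-pandas check for such constant columns (a sketch, independent of inference_model.eda):

# Columns with at most one distinct value carry no information for a model.
n_unique = df_pd.nunique(dropna=False)
constant_cols = n_unique[n_unique <= 1].index.tolist()
print(constant_cols)  # here: ['barometric_pressure', 'car_id']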
In [8]:
data_std, fig = std(
    df=df_pd[cont_cols],
    scale="log",
    plot=True,
)
data_std.head()
Out[8]:
fuel_rg_pressure       14377.152211
engine_speed             270.781548
engine_coolant_temp       21.888726
engine_load               21.587427
intake_air_temp           18.163539
dtype: float64
In [9]:
col_entropies, fig = entropy(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
col_entropies.head()
Out[9]:
maf                 4.800777
engine_speed        3.981783
fuel_rg_pressure    3.797527
intake_air_temp     3.612889
engine_load         3.253640
dtype: float64
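For reference, the Shannon entropy of a discrete feature can be computed from its value counts. The sketch below uses scipy.stats.entropy and is only an assumption about what entropy in inference_model.eda reports; continuous features may additionally be binned, and the logarithm base may differ:

from scipy.stats import entropy as shannon_entropy

# Shannon entropy of the empirical value distribution of one column.
probs = df_pd["maf"].value_counts(normalize=True)
print(shannon_entropy(probs))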
Plotting¶
In [10]:
fig = cross_correlation(df=df_pd[cont_cols], n=10, verbose=True)
Top 10 absolute correlations
vehicle_speed        fuel_rg_pressure       0.648543
                     maf                    0.544904
engine_load          engine_coolant_temp    0.531108
engine_speed         vehicle_speed          0.525832
engine_coolant_temp  throttle_position      0.491290
engine_speed         maf                    0.398281
maf                  fuel_rg_pressure       0.331396
engine_speed         fuel_rg_pressure       0.329811
engine_coolant_temp  intake_air_temp        0.305559
engine_load          intake_air_temp        0.207530
dtype: float64
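The same "top n absolute correlations" listing can be reproduced with plain pandas, which is handy outside this plotting helper (a sketch, not the cross_correlation implementation):

# Absolute pairwise correlations, one row per column pair.
abs_corr = df_pd[cont_cols].corr().abs().unstack()
# Remove self-correlations and the duplicated (b, a) half of each pair.
abs_corr = abs_corr[abs_corr.index.get_level_values(0) < abs_corr.index.get_level_values(1)]
print(abs_corr.sort_values(ascending=False).head(10))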
In [11]:
fig = distributions(
    df=df_pd[cont_cols],
    low_per_cut=0,
    high_per_cut=1,
    type="box",
)
/home/jovyan/success6g-edge/inference_model/eda/features/plotting.py:98: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
  ax[i].set_xlim(data[col].quantile(low_per_cut), data[col].quantile(high_per_cut))
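The warning comes from barometric_pressure: the column is constant, so its lower and upper quantiles coincide and matplotlib cannot set distinct x-limits for that panel. Dropping constant columns before plotting avoids it (a sketch; the plotting helper itself is unchanged):

# Plot only the continuous columns that actually vary.
varying_cols = [c for c in cont_cols if df_pd[c].nunique() > 1]
fig = distributions(
    df=df_pd[varying_cols],
    low_per_cut=0,
    high_per_cut=1,
    type="box",
)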