Feature analysis¶
Purpose:
- analyse the dataset features without including the target value
- check whether the dataset looks healthy:
    - duplicate values
    - NA values
    - zero values
- check whether values are "too constant", i.e. most probably bad features for classification/regression:
    - very small std values
    - entropy that is too low (near-constant column) or too high (almost every row unique); entropy is a measure of disorder or uncertainty
- check whether the number of unique values in categorical features is too high - such features are most probably unable to identify any group of targets
- drop highly correlated features (a plain-pandas sketch of this step follows the list):
    - to improve the performance of model training
    - to decrease the dataset size for further exploration
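The correlation check later in this notebook only plots and lists the strongest pairs; the dropping itself is left to the user. A minimal plain-pandas sketch of how it could be automated is shown below. The drop_highly_correlated helper and the 0.95 threshold are illustrative assumptions, not part of inference_model.eda.

import numpy as np
import pandas as pd


def drop_highly_correlated(df: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame:
    # Absolute pairwise Pearson correlations between the numeric columns.
    corr = df.corr(numeric_only=True).abs()
    # Keep only the upper triangle so every pair is inspected exactly once.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    # Drop each column that is highly correlated with an earlier column.
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)

At a 0.95 threshold nothing in this particular dataset would be dropped (the strongest pair reported below is about 0.65), which is exactly the kind of conclusion this check is meant to support.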
Imports¶
In [1]:
import ast
import os
import sys

# Make the notebook directory importable and move to the repository root,
# so that `inference_model` and the relative data paths resolve correctly.
sys.path.append(os.getcwd())
os.chdir("../..")

import pandas as pd

# import plotly.graph_objects as go
from inference_model.eda.features.analysis import (
    entropy,
    init_check,
    missing,
    nunique,
    std,
    zero,
)
from inference_model.eda.features.plotting import cross_correlation, distributions

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)
Dataset¶
In [2]:
# 1. get/create some example data
with open("data/log_tiguan_27_mar_dac.txt") as f:
    data = ast.literal_eval(f.read())

df_pd = pd.DataFrame()
for data_value in data:
    # Each entry is a single-key dict: {signal_name: [{"ts_millis:": ..., "value": ...}, ...]}.
    # Sort the readings by timestamp and keep the values as one column per signal.
    temp_df = pd.DataFrame(data_value[list(data_value)[0]]).sort_values(
        by="ts_millis:", ascending=True
    )["value"]
    temp_df.rename(list(data_value)[0], inplace=True)
    df_pd = pd.concat([df_pd, temp_df], axis=1)

df_pd.dropna(inplace=True)

# Add a synthetic target, a synthetic identifier and an artificial anomaly in engine_load.
df_pd["class"] = 0
df_pd["car_id"] = "123abc"
df_pd.loc[:100, ["class"]] = 1
df_pd.loc[:100, ["engine_load"]] = 100
df_pd.head()
Out[2]:
|   | engine_load | engine_coolant_temp | engine_speed | vehicle_speed | intake_air_temp | maf | throttle_position | fuel_rg_pressure | barometric_pressure | control_voltage | class | car_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100.0 | 17.0 | 904.5 | 0.0 | 10.0 | 12.55 | 83.14 | 37270.0 | 101.0 | 0.06 | 1 | 123abc |
| 1 | 100.0 | 17.0 | 906.0 | 0.0 | 11.0 | 12.36 | 83.14 | 37800.0 | 101.0 | 14.56 | 1 | 123abc |
| 2 | 100.0 | 17.0 | 905.0 | 0.0 | 10.0 | 12.36 | 83.53 | 37800.0 | 101.0 | 14.68 | 1 | 123abc |
| 3 | 100.0 | 18.0 | 905.5 | 0.0 | 11.0 | 12.30 | 83.53 | 37800.0 | 101.0 | 14.72 | 1 | 123abc |
| 4 | 100.0 | 18.0 | 907.0 | 0.0 | 11.0 | 12.47 | 83.14 | 36740.0 | 101.0 | 14.72 | 1 | 123abc |
In [3]:
target_col = "class"
id_cols = ["car_id"]
cat_cols = []

# Continuous columns are everything except the id, categorical and target columns.
cont_cols = df_pd.drop(
    columns=id_cols + cat_cols + [target_col]
).columns.values.tolist()

# Cast categorical columns to string (a no-op here, since cat_cols is empty),
# then pass None downstream to signal that there are no categorical features.
df_pd[cat_cols] = df_pd[cat_cols].astype(str)
cat_cols = None
Features¶
Analysis¶
In [4]:
duplicated_ids, cont_cols_desc, cat_cols_desc = init_check(
    df=df_pd,
    identifier=id_cols,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
    verbose=True,
)
# duplicated_ids, cont_cols_desc.head(), cat_cols_desc.head()
[CHECK] Number of duplicated ids: 1034

[CHECK] Numerical columns

| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| maf | 1035.0 | 11.00 | 3.40 | 0.03 | 9.77 | 10.19 | 11.99 | 31.83 |
| vehicle_speed | 1035.0 | 0.95 | 3.70 | 0.00 | 0.00 | 0.00 | 0.00 | 20.00 |
| engine_load | 1035.0 | 46.29 | 21.59 | 30.59 | 35.69 | 37.25 | 38.82 | 100.00 |
| barometric_pressure | 1035.0 | 101.00 | 0.00 | 101.00 | 101.00 | 101.00 | 101.00 | 101.00 |
| intake_air_temp | 1035.0 | 28.56 | 18.16 | -6.00 | 20.00 | 26.00 | 32.00 | 145.00 |
| throttle_position | 1035.0 | 83.21 | 0.15 | 83.14 | 83.14 | 83.14 | 83.14 | 83.53 |
| engine_coolant_temp | 1035.0 | 76.66 | 21.89 | 17.00 | 74.50 | 88.00 | 89.00 | 91.00 |
| engine_speed | 1035.0 | 929.39 | 270.78 | 3.20 | 893.00 | 896.50 | 899.50 | 2282.50 |
| fuel_rg_pressure | 1035.0 | 35967.15 | 14377.15 | 110.00 | 31220.00 | 34050.00 | 35390.00 | 108770.00 |
| control_voltage | 1035.0 | 14.32 | 2.14 | 0.05 | 14.76 | 14.78 | 14.80 | 14.84 |
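The 1034 duplicated ids are expected here: all 1035 rows share the single synthetic car_id "123abc", so every row after the first repeats an identifier that was already seen. Assuming init_check counts duplicates over the identifier columns (which matches the number above), the same figure can be reproduced with plain pandas:

# Every row after the first has an identifier already seen, hence 1034.
print(df_pd.duplicated(subset=id_cols).sum())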
In [5]:
missing_val_frac, fig = missing(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
missing_val_frac.head()
Out[5]:
engine_load            0.0
engine_coolant_temp    0.0
engine_speed           0.0
vehicle_speed          0.0
intake_air_temp        0.0
dtype: float64
In [6]:
zero_val_frac, fig = zero(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
zero_val_frac.head()
Out[6]:
vehicle_speed          0.927536
class                  0.902415
engine_load            0.000000
engine_coolant_temp    0.000000
engine_speed           0.000000
dtype: float64
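There are no missing values, but vehicle_speed is zero in about 93% of the rows (the car is mostly standing still) and class is zero by construction. If columns dominated by zeros or NAs had to be dropped automatically, a minimal plain-pandas sketch could look like this; the 0.95 cut-off is an illustrative assumption, not something prescribed by inference_model.eda:

max_bad_frac = 0.95  # assumed cut-off, tune per use case

zero_frac = (df_pd[cont_cols] == 0).mean()
missing_frac = df_pd[cont_cols].isna().mean()

# Drop continuous features that are almost entirely zero or missing.
flagged = (zero_frac > max_bad_frac) | (missing_frac > max_bad_frac)
cols_to_drop = flagged[flagged].index.tolist()
df_reduced = df_pd.drop(columns=cols_to_drop)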
In [7]:
data_nunique, fig = nunique(
    df=df_pd.drop(columns=id_cols),
    scale="log",
    plot=True,
)
data_nunique.head()
Out[7]:
barometric_pressure     1
throttle_position       2
class                   2
vehicle_speed          21
control_voltage        48
dtype: int64
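barometric_pressure has a single unique value and therefore cannot help any model. A short plain-pandas check for such constant columns (a sketch, independent of inference_model.eda):

# Columns with at most one distinct value carry no information for a model.
n_unique = df_pd.nunique(dropna=False)
constant_cols = n_unique[n_unique <= 1].index.tolist()
print(constant_cols)  # here: ['barometric_pressure', 'car_id']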
In [8]:
data_std, fig = std(
    df=df_pd[cont_cols],
    scale="log",
    plot=True,
)
data_std.head()
Out[8]:
fuel_rg_pressure       14377.152211
engine_speed             270.781548
engine_coolant_temp       21.888726
engine_load               21.587427
intake_air_temp           18.163539
dtype: float64
In [9]:
col_entropies, fig = entropy(
    df=df_pd.drop(columns=id_cols),
    scale="linear",
    plot=True,
)
col_entropies.head()
Out[9]:
maf                 4.800777
engine_speed        3.981783
fuel_rg_pressure    3.797527
intake_air_temp     3.612889
engine_load         3.253640
dtype: float64
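For reference, the Shannon entropy of a discrete feature can be computed from its value counts. The sketch below uses scipy.stats.entropy and is only an assumption about what entropy in inference_model.eda reports; continuous features may additionally be binned, and the logarithm base may differ:

from scipy.stats import entropy as shannon_entropy

# Shannon entropy of the empirical value distribution of one column.
probs = df_pd["maf"].value_counts(normalize=True)
print(shannon_entropy(probs))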
Plotting¶
In [10]:
fig = cross_correlation(df=df_pd[cont_cols], n=10, verbose=True)
Top 10 absolute correlations
vehicle_speed        fuel_rg_pressure       0.648543
                     maf                    0.544904
engine_load          engine_coolant_temp    0.531108
engine_speed         vehicle_speed          0.525832
engine_coolant_temp  throttle_position      0.491290
engine_speed         maf                    0.398281
maf                  fuel_rg_pressure       0.331396
engine_speed         fuel_rg_pressure       0.329811
engine_coolant_temp  intake_air_temp        0.305559
engine_load          intake_air_temp        0.207530
dtype: float64
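The same "top n absolute correlations" listing can be reproduced with plain pandas, which is handy outside this plotting helper (a sketch, not the cross_correlation implementation):

# Absolute pairwise correlations, one row per column pair.
abs_corr = df_pd[cont_cols].corr().abs().unstack()
# Remove self-correlations and the duplicated (b, a) half of each pair.
abs_corr = abs_corr[abs_corr.index.get_level_values(0) < abs_corr.index.get_level_values(1)]
print(abs_corr.sort_values(ascending=False).head(10))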
In [11]:
fig = distributions(
    df=df_pd[cont_cols],
    low_per_cut=0,
    high_per_cut=1,
    type="box",
)
/home/jovyan/success6g-edge/inference_model/eda/features/plotting.py:98: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
  ax[i].set_xlim(data[col].quantile(low_per_cut), data[col].quantile(high_per_cut))
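The warning comes from barometric_pressure: the column is constant, so its lower and upper quantiles coincide and matplotlib cannot set distinct x-limits for that panel. Dropping constant columns before plotting avoids it (a sketch; the plotting helper itself is unchanged):

# Plot only the continuous columns that actually vary.
varying_cols = [c for c in cont_cols if df_pd[c].nunique() > 1]
fig = distributions(
    df=df_pd[varying_cols],
    low_per_cut=0,
    high_per_cut=1,
    type="box",
)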