THIS IS A WORK IN PROGRESS¶
Surname origin country classification¶
- approach shamelessly copied from https://www.kaggle.com/code/yonatankpl/surname-classification-with-bert
- NOTE from author:
Based on the discussion [here](https://www.kaggle.com/competitions/playground-series-s4e1/discussion/465517), I created a Bert based Surname classifier.
- possible improvement of the base dataset by scraping wiki: https://github.com/greenelab/wiki-nationality-estimate
In [1]:
Copied!
import os
import sys
sys.path.append(os.getcwd())
os.chdir("..")
import pandas as pd
import torch
from src import surname_classification
from src import utils
from src.label_encoder import LabelEncoder
from torch.utils.data import DataLoader
import pickle
from transformers import (
BertForSequenceClassification,
get_linear_schedule_with_warmup,
BertTokenizer,
)
from itertools import islice
from sklearn.model_selection import train_test_split
import os
import sys
sys.path.append(os.getcwd())
os.chdir("..")
import pandas as pd
import torch
from src import surname_classification
from src import utils
from src.label_encoder import LabelEncoder
from torch.utils.data import DataLoader
import pickle
from transformers import (
BertForSequenceClassification,
get_linear_schedule_with_warmup,
BertTokenizer,
)
from itertools import islice
from sklearn.model_selection import train_test_split
In [3]:
Copied!
# surname_data = pd.read_csv("data/raw/surname-nationality.csv")
# splitted_data = pd.read_csv("data/raw/surnames_with_splits.csv")
# surname_data = pd.read_csv("data/raw/surname-nationality.csv")
# splitted_data = pd.read_csv("data/raw/surnames_with_splits.csv")
In [2]:
Copied!
df = pd.read_parquet("data/preprocessed/final_dataset.parquet")
df = pd.read_parquet("data/preprocessed/final_dataset.parquet")
In [3]:
Copied!
df.head()
df.head()
Out[3]:
surname | country | |
---|---|---|
0 | Mengel | Estonia |
1 | Saaremae | Estonia |
2 | Rikkiev | Estonia |
3 | Est | Estonia |
4 | Villandi | Estonia |
In [4]:
Copied!
label_encoder = LabelEncoder(columns_to_encode=["country"])
df = label_encoder.fit_transform(df)
# df = self.label_encoder.transform(df)
label_encoder = LabelEncoder(columns_to_encode=["country"])
df = label_encoder.fit_transform(df)
# df = self.label_encoder.transform(df)
In [13]:
Copied!
list(islice(label_encoder.encoding_dict["country"].items(), 5))
list(islice(label_encoder.encoding_dict["country"].items(), 5))
Out[13]:
[('Estonia', 1), ('Bolivia', 2), ('Tunisia', 3), ('England', 4), ('Russia', 5)]
In [6]:
Copied!
df.head()
df.head()
Out[6]:
surname | country | |
---|---|---|
0 | Mengel | 1 |
1 | Saaremae | 1 |
2 | Rikkiev | 1 |
3 | Est | 1 |
4 | Villandi | 1 |
In [7]:
Copied!
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
df, test_size=valid_size, stratify=df["country"], random_state=random_state
)
df_valid, df_test = train_test_split(
df_valid,
test_size=test_size,
stratify=df_valid["country"],
random_state=random_state,
)
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
df, test_size=valid_size, stratify=df["country"], random_state=random_state
)
df_valid, df_test = train_test_split(
df_valid,
test_size=test_size,
stratify=df_valid["country"],
random_state=random_state,
)
In [ ]:
Copied!
# Tokenize each dataset
# train_encodings = surname_classification.tokenize_data(df_train, surname_col="surname")
# val_encodings = surname_classification.tokenize_data(df_val, surname_col="surname")
test_encodings = surname_classification.tokenize_data(df_test, surname_col="surname")
# Tokenize each dataset
# train_encodings = surname_classification.tokenize_data(df_train, surname_col="surname")
# val_encodings = surname_classification.tokenize_data(df_val, surname_col="surname")
test_encodings = surname_classification.tokenize_data(df_test, surname_col="surname")
In [8]:
Copied!
batch_size = 32
dataset_train = surname_classification.create_dataset(
train_encodings, df_train, labels="country"
)
dataset_val = surname_classification.create_dataset(
val_encodings, df_val, labels="country"
)
dataset_test = surname_classification.create_dataset(
test_encodings, df_test, labels="country"
)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size)
batch_size = 32
dataset_train = surname_classification.create_dataset(
train_encodings, df_train, labels="country"
)
dataset_val = surname_classification.create_dataset(
val_encodings, df_val, labels="country"
)
dataset_test = surname_classification.create_dataset(
test_encodings, df_test, labels="country"
)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size)
In [9]:
Copied!
num_labels = df["country"].nunique()
num_epochs = 2
model = BertForSequenceClassification.from_pretrained(
"bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)
# freeze layers
for param in model.bert.parameters():
param.requires_grad = False
optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0.2, # Usually a fraction of total_steps
num_training_steps=total_steps,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_labels = df["country"].nunique()
num_epochs = 2
model = BertForSequenceClassification.from_pretrained(
"bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)
# freeze layers
for param in model.bert.parameters():
param.requires_grad = False
optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0.2, # Usually a fraction of total_steps
num_training_steps=total_steps,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. /mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn(
Out[9]:
BertForSequenceClassification( (bert): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(28996, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.4, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0-11): 12 x BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.4, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.4, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) (dropout): Dropout(p=0.4, inplace=False) (classifier): Linear(in_features=768, out_features=18, bias=True) )
In [10]:
Copied!
trained_model = surname_classification.train_model(
model,
train_dataloader,
val_dataloader,
optimizer,
scheduler,
device,
num_epochs=num_epochs,
)
trained_model = surname_classification.train_model(
model,
train_dataloader,
val_dataloader,
optimizer,
scheduler,
device,
num_epochs=num_epochs,
)
log from running a script on a server
(ecovadis) pmulinka@supercom-wssonata:~/assignment/ecovadis_assignment$ python notebooks/eda/surname_classification_with_bert.py
/home/pmulinka/assignment/ecovadis_assignment
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/pmulinka/assignment/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Epoch 1/10
Training Loss: 2.394
Validation Loss: 2.256, Validation Accuracy: 0.267
Epoch 2/10
Training Loss: 2.267
Validation Loss: 2.251, Validation Accuracy: 0.267
Epoch 3/10
Training Loss: 2.265
Validation Loss: 2.243, Validation Accuracy: 0.306
Epoch 4/10
Training Loss: 2.250
Validation Loss: 2.237, Validation Accuracy: 0.385
Epoch 5/10
Training Loss: 2.251
Validation Loss: 2.227, Validation Accuracy: 0.317
Epoch 6/10
Training Loss: 1.620
Validation Loss: 0.982, Validation Accuracy: 0.735
Epoch 7/10
Training Loss: 1.062
Validation Loss: 0.826, Validation Accuracy: 0.772
Epoch 8/10
Training Loss: 0.911
Validation Loss: 0.799, Validation Accuracy: 0.785
Epoch 9/10
Training Loss: 0.832
Validation Loss: 0.775, Validation Accuracy: 0.791
Epoch 10/10
Training Loss: 0.780
Validation Loss: 0.768, Validation Accuracy: 0.790
Training complete
Evaluation on test dataset yield
Loss: 0.7684005849206677
Accuracy: 0.7960164835164836
In [ ]:
Copied!
loss, accuracy = surname_classification.evaluate_model(
trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yield\nLoss: {loss}\nAccuracy: {accuracy}")
loss, accuracy = surname_classification.evaluate_model(
trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yield\nLoss: {loss}\nAccuracy: {accuracy}")
In [7]:
Copied!
utils.dill_dump(
file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
utils.dill_dump(
file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
In [ ]:
Copied!
# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)
# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)
In [10]:
Copied!
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model.pth"))
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model.pth"))
In [15]:
Copied!
predicted_surnames
predicted_surnames
Out[15]:
[('Hsieh', 'English'), ('McDonald', 'Spanish'), ('Kharlamov', 'Arabic'), ('Vasiliev', 'English'), ('Bellucci', 'English'), ('Wallace', 'English'), ('Chao', 'Chinese'), ('Boylan', 'English'), ('Burgess', 'Chinese'), ('Cattaneo', 'English')]
In [14]:
Copied!
df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
predicted_surnames = surname_classification.predict_nationality(
trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)
df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
predicted_surnames = surname_classification.predict_nationality(
trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)
Predicting Nationalities: 0%| | 0/10 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`. /mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2674: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert). warnings.warn(
In [ ]:
Copied!
predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()
predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()
In [ ]:
Copied!