THIS IS A WORK IN PROGRESS
Surname origin country classification
- Approach shamelessly copied from https://www.kaggle.com/code/yonatankpl/surname-classification-with-bert
- Note from the original author: "Based on the discussion [here](https://www.kaggle.com/competitions/playground-series-s4e1/discussion/465517), I created a BERT-based surname classifier."
- Possible improvement of the base dataset by scraping Wikipedia: https://github.com/greenelab/wiki-nationality-estimate
In [1]:
import os
import sys

sys.path.append(os.getcwd())
os.chdir("../../...")

import pandas as pd
import torch
import pickle
from torch.utils.data import DataLoader
from transformers import (
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    BertTokenizer,
)

from churn_pred.preprocessing import surname_classification
from churn_pred import utils
In [2]:
surname_data = pd.read_csv("data/surnames/surname-nationality.csv")
splitted_data = pd.read_csv("data/surnames/surnames_with_splits.csv")
In [5]:
splitted_data
Out[5]:
|       | nationality | nationality_index | split | surname  |
|-------|-------------|-------------------|-------|----------|
| 0     | Arabic      | 15                | train | Totah    |
| 1     | Arabic      | 15                | train | Abboud   |
| 2     | Arabic      | 15                | train | Fakhoury |
| 3     | Arabic      | 15                | train | Srour    |
| 4     | Arabic      | 15                | train | Sayegh   |
| ...   | ...         | ...               | ...   | ...      |
| 10975 | Vietnamese  | 11                | test  | Dinh     |
| 10976 | Vietnamese  | 11                | test  | Phung    |
| 10977 | Vietnamese  | 11                | test  | Quang    |
| 10978 | Vietnamese  | 11                | test  | Vu       |
| 10979 | Vietnamese  | 11                | test  | Ha       |
10980 rows × 4 columns
In [6]:
surname_data.surname.nunique()
Out[6]:
30923
In [4]:
splitted_data["nationality_index"]
Out[4]:
0        15
1        15
2        15
3        15
4        15
         ..
10975    11
10976    11
10977    11
10978    11
10979    11
Name: nationality_index, Length: 10980, dtype: int64
In [3]:
label_decoder = dict(
    zip(splitted_data["nationality"], splitted_data["nationality_index"])
)
inverse_label_dict = {v: k for k, v in label_decoder.items()}
In [4]:
# Split data into train, validation, and test sets
train_df = splitted_data[splitted_data["split"] == "train"]
val_df = splitted_data[
    splitted_data["split"] == "val"
]  # Assuming "val" is used for validation set
test_df = splitted_data[splitted_data["split"] == "test"]
In [5]:
# Tokenize each dataset
train_encodings = surname_classification.tokenize_data(train_df)
val_encodings = surname_classification.tokenize_data(val_df)
test_encodings = surname_classification.tokenize_data(test_df)
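`tokenize_data` is a helper from this repo's `churn_pred.preprocessing.surname_classification` module and is not shown here. A minimal sketch of what it presumably does, assuming it wraps the Hugging Face `BertTokenizer` and encodes the `surname` column (names and parameters below are hypothetical):

from transformers import BertTokenizer

# Hypothetical sketch -- the real tokenize_data may differ.
_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_data_sketch(df, max_length=16):
    # Encode surname strings into input_ids / attention_mask tensors
    return _tokenizer(
        df["surname"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )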
In [8]:
batch_size = 32

train_dataset = surname_classification.create_dataset(train_encodings, train_df)
val_dataset = surname_classification.create_dataset(val_encodings, val_df)
test_dataset = surname_classification.create_dataset(test_encodings, test_df)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
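`create_dataset` is another repo-internal helper. A plausible sketch, assuming it simply pairs the encodings with the integer labels in a `TensorDataset`:

import torch
from torch.utils.data import TensorDataset

# Hypothetical sketch of create_dataset -- the actual implementation may differ.
def create_dataset_sketch(encodings, df):
    labels = torch.tensor(df["nationality_index"].values)
    return TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)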
In [9]:
num_labels = splitted_data["nationality_index"].nunique()
num_epochs = 10

model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)

# Freeze the BERT encoder; only the classification head is trained
for param in model.bert.parameters():
    param.requires_grad = False

optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)

total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.2,  # NOTE: should be an integer step count, e.g. int(0.2 * total_steps); as written warmup is effectively skipped
    num_training_steps=total_steps,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
Out[9]:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.4, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.4, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.4, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (classifier): Linear(in_features=768, out_features=18, bias=True)
)
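`create_optimizer` is not shown; judging by the FutureWarning above it wraps the deprecated `transformers.AdamW`. An equivalent sketch using the PyTorch optimizer, with the same hyperparameters (the function name is hypothetical):

import torch

# Hypothetical equivalent of create_optimizer; only parameters with
# requires_grad=True (the classifier head, since BERT is frozen) are optimized.
def create_optimizer_sketch(model, learning_rate=5e-5, eps=1e-8):
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.AdamW(trainable, lr=learning_rate, eps=eps)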
In [10]:
trained_model = surname_classification.train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    device,
    num_epochs=num_epochs,
)
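`train_model` comes from the same module and its exact loop is not shown. A condensed sketch of a standard fine-tuning loop that would produce the logged output below (per-epoch training loss plus validation loss and accuracy); it assumes each batch unpacks into `(input_ids, attention_mask, labels)` as in the dataset sketch above:

# Hypothetical sketch of the training loop assumed by train_model.
def train_model_sketch(model, train_dl, val_dl, optimizer, scheduler, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, labels in train_dl:
            optimizer.zero_grad()
            out = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            out.loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += out.loss.item()
        model.eval()
        val_loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_dl:
                out = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    labels=labels.to(device),
                )
                val_loss += out.loss.item()
                correct += (out.logits.argmax(dim=-1).cpu() == labels).sum().item()
                total += labels.size(0)
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Training Loss: {train_loss / len(train_dl):.3f}")
        print(f"Validation Loss: {val_loss / len(val_dl):.3f}, "
              f"Validation Accuracy: {correct / total:.3f}")
    print("Training complete")
    return model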
Log from running the training script on a server:
(ecovadis) pmulinka@supercom-wssonata:~/assignment/ecovadis_assignment$ python notebooks/eda/surname_classification_with_bert.py
/home/pmulinka/assignment/ecovadis_assignment
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/pmulinka/assignment/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Epoch 1/10
Training Loss: 2.394
Validation Loss: 2.256, Validation Accuracy: 0.267
Epoch 2/10
Training Loss: 2.267
Validation Loss: 2.251, Validation Accuracy: 0.267
Epoch 3/10
Training Loss: 2.265
Validation Loss: 2.243, Validation Accuracy: 0.306
Epoch 4/10
Training Loss: 2.250
Validation Loss: 2.237, Validation Accuracy: 0.385
Epoch 5/10
Training Loss: 2.251
Validation Loss: 2.227, Validation Accuracy: 0.317
Epoch 6/10
Training Loss: 1.620
Validation Loss: 0.982, Validation Accuracy: 0.735
Epoch 7/10
Training Loss: 1.062
Validation Loss: 0.826, Validation Accuracy: 0.772
Epoch 8/10
Training Loss: 0.911
Validation Loss: 0.799, Validation Accuracy: 0.785
Epoch 9/10
Training Loss: 0.832
Validation Loss: 0.775, Validation Accuracy: 0.791
Epoch 10/10
Training Loss: 0.780
Validation Loss: 0.768, Validation Accuracy: 0.790
Training complete
Evaluation on test dataset yields
Loss: 0.7684005849206677
Accuracy: 0.7960164835164836
In [ ]:
loss, accuracy = surname_classification.evaluate_model(
    trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yields\nLoss: {loss}\nAccuracy: {accuracy}")
In [8]:
inverse_label_dict
Out[8]:
{15: 'Arabic', 3: 'Chinese', 5: 'Czech', 2: 'Dutch', 12: 'English', 8: 'French', 9: 'German', 4: 'Greek', 1: 'Irish', 17: 'Italian', 7: 'Japanese', 16: 'Korean', 14: 'Polish', 0: 'Portuguese', 13: 'Russian', 10: 'Scottish', 6: 'Spanish', 11: 'Vietnamese'}
In [7]:
utils.dill_dump(
    file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
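`utils.dill_dump` / `utils.dill_load` are thin project wrappers; presumably something along these lines (sketch, not the actual implementation):

import dill

# Hypothetical sketch of the churn_pred.utils serialization helpers.
def dill_dump_sketch(file_loc, content):
    with open(file_loc, "wb") as f:
        dill.dump(content, f)

def dill_load_sketch(file_loc):
    with open(file_loc, "rb") as f:
        return dill.load(f)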
In [ ]:
# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
    file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)
In [10]:
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model.pth"))
In [15]:
predicted_surnames
Out[15]:
[('Hsieh', 'English'), ('McDonald', 'Spanish'), ('Kharlamov', 'Arabic'), ('Vasiliev', 'English'), ('Bellucci', 'English'), ('Wallace', 'English'), ('Chao', 'Chinese'), ('Boylan', 'English'), ('Burgess', 'Chinese'), ('Cattaneo', 'English')]
In [14]:
df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()

# NOTE: the model was trained with "bert-base-cased"; using the uncased tokenizer
# here is likely a mismatch and may hurt prediction quality
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

predicted_surnames = surname_classification.predict_nationality(
    trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)
Predicting Nationalities:   0%|          | 0/10 [00:00<?, ?it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2674: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
  warnings.warn(
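`predict_nationality` is also a repo helper. Based on the warnings above (per-surname tokenization with `max_length` / `pad_to_max_length`), a sketch of the likely logic: encode each surname, run the classifier, take the arg-max class, and map it back through `inverse_label_dict`:

# Hypothetical sketch of predict_nationality.
def predict_nationality_sketch(model, surnames, tokenizer, inverse_label_dict, device):
    model.eval()
    results = []
    with torch.no_grad():
        for surname in surnames:
            enc = tokenizer(surname, return_tensors="pt", truncation=True, padding=True)
            logits = model(
                input_ids=enc["input_ids"].to(device),
                attention_mask=enc["attention_mask"].to(device),
            ).logits
            results.append((surname, inverse_label_dict[logits.argmax(dim=-1).item()]))
    return results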
In [ ]:
predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()