THIS IS A WORK IN PROGRESS
Surname origin country classification
- Approach shamelessly copied from https://www.kaggle.com/code/yonatankpl/surname-classification-with-bert
- Note from the original author: "Based on the discussion [here](https://www.kaggle.com/competitions/playground-series-s4e1/discussion/465517), I created a BERT-based surname classifier."
- Possible improvement of the base dataset by scraping Wikipedia: https://github.com/greenelab/wiki-nationality-estimate
In [1]:
import os
import sys

sys.path.append(os.getcwd())
os.chdir("../../...")

import pandas as pd
import torch
import pickle
from torch.utils.data import DataLoader
from transformers import (
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    BertTokenizer,
)

from churn_pred.preprocessing import surname_classification
from churn_pred import utils
In [2]:
surname_data = pd.read_csv("data/surnames/surname-nationality.csv")
splitted_data = pd.read_csv("data/surnames/surnames_with_splits.csv")
In [5]:
splitted_data
Out[5]:
|       | nationality | nationality_index | split | surname  |
|-------|-------------|-------------------|-------|----------|
| 0     | Arabic      | 15                | train | Totah    |
| 1     | Arabic      | 15                | train | Abboud   |
| 2     | Arabic      | 15                | train | Fakhoury |
| 3     | Arabic      | 15                | train | Srour    |
| 4     | Arabic      | 15                | train | Sayegh   |
| ...   | ...         | ...               | ...   | ...      |
| 10975 | Vietnamese  | 11                | test  | Dinh     |
| 10976 | Vietnamese  | 11                | test  | Phung    |
| 10977 | Vietnamese  | 11                | test  | Quang    |
| 10978 | Vietnamese  | 11                | test  | Vu       |
| 10979 | Vietnamese  | 11                | test  | Ha       |
10980 rows × 4 columns
In [6]:
surname_data.surname.nunique()
Out[6]:
30923
In [4]:
splitted_data["nationality_index"]
Out[4]:
0        15
1        15
2        15
3        15
4        15
         ..
10975    11
10976    11
10977    11
10978    11
10979    11
Name: nationality_index, Length: 10980, dtype: int64
In [3]:
label_decoder = dict(
    zip(splitted_data["nationality"], splitted_data["nationality_index"])
)
inverse_label_dict = {v: k for k, v in label_decoder.items()}
In [4]:
# Split data into train, validation, and test sets
train_df = splitted_data[splitted_data["split"] == "train"]
val_df = splitted_data[
    splitted_data["split"] == "val"
]  # Assuming "val" is used for validation set
test_df = splitted_data[splitted_data["split"] == "test"]
In [5]:
# Tokenize each dataset
train_encodings = surname_classification.tokenize_data(train_df)
val_encodings = surname_classification.tokenize_data(val_df)
test_encodings = surname_classification.tokenize_data(test_df)
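`tokenize_data` is a helper from this repo's `churn_pred.preprocessing.surname_classification` module and is not shown here. A minimal sketch of what it presumably does, assuming it wraps the Hugging Face `BertTokenizer` and encodes the `surname` column (names and parameters below are hypothetical):

from transformers import BertTokenizer

# Hypothetical sketch -- the real tokenize_data may differ.
_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_data_sketch(df, max_length=16):
    # Encode surname strings into input_ids / attention_mask tensors
    return _tokenizer(
        df["surname"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )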
In [8]:
batch_size = 32

train_dataset = surname_classification.create_dataset(train_encodings, train_df)
val_dataset = surname_classification.create_dataset(val_encodings, val_df)
test_dataset = surname_classification.create_dataset(test_encodings, test_df)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
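`create_dataset` is another repo-internal helper. A plausible sketch, assuming it simply pairs the encodings with the integer labels in a `TensorDataset`:

import torch
from torch.utils.data import TensorDataset

# Hypothetical sketch of create_dataset -- the actual implementation may differ.
def create_dataset_sketch(encodings, df):
    labels = torch.tensor(df["nationality_index"].values)
    return TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)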
In [9]:
num_labels = splitted_data["nationality_index"].nunique()
num_epochs = 10

model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)

# Freeze the BERT encoder; only the classification head is trained
for param in model.bert.parameters():
    param.requires_grad = False

optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)

total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.2,  # NOTE: should be an integer step count, e.g. int(0.2 * total_steps); as written warmup is effectively skipped
    num_training_steps=total_steps,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
Out[9]:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.4, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.4, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.4, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (classifier): Linear(in_features=768, out_features=18, bias=True)
)
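`create_optimizer` is not shown; judging by the FutureWarning above it wraps the deprecated `transformers.AdamW`. An equivalent sketch using the PyTorch optimizer, with the same hyperparameters (the function name is hypothetical):

import torch

# Hypothetical equivalent of create_optimizer; only parameters with
# requires_grad=True (the classifier head, since BERT is frozen) are optimized.
def create_optimizer_sketch(model, learning_rate=5e-5, eps=1e-8):
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.AdamW(trainable, lr=learning_rate, eps=eps)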
In [10]:
trained_model = surname_classification.train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    device,
    num_epochs=num_epochs,
)
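`train_model` comes from the same module and its exact loop is not shown. A condensed sketch of a standard fine-tuning loop that would produce the logged output below (per-epoch training loss plus validation loss and accuracy); it assumes each batch unpacks into `(input_ids, attention_mask, labels)` as in the dataset sketch above:

# Hypothetical sketch of the training loop assumed by train_model.
def train_model_sketch(model, train_dl, val_dl, optimizer, scheduler, device, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, labels in train_dl:
            optimizer.zero_grad()
            out = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            out.loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += out.loss.item()
        model.eval()
        val_loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_dl:
                out = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    labels=labels.to(device),
                )
                val_loss += out.loss.item()
                correct += (out.logits.argmax(dim=-1).cpu() == labels).sum().item()
                total += labels.size(0)
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Training Loss: {train_loss / len(train_dl):.3f}")
        print(f"Validation Loss: {val_loss / len(val_dl):.3f}, "
              f"Validation Accuracy: {correct / total:.3f}")
    print("Training complete")
    return model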
Log from running the training script on a server:
(ecovadis) pmulinka@supercom-wssonata:~/assignment/ecovadis_assignment$ python notebooks/eda/surname_classification_with_bert.py
/home/pmulinka/assignment/ecovadis_assignment
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/pmulinka/assignment/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Epoch 1/10
Training Loss: 2.394
Validation Loss: 2.256, Validation Accuracy: 0.267
Epoch 2/10
Training Loss: 2.267
Validation Loss: 2.251, Validation Accuracy: 0.267
Epoch 3/10
Training Loss: 2.265
Validation Loss: 2.243, Validation Accuracy: 0.306
Epoch 4/10
Training Loss: 2.250
Validation Loss: 2.237, Validation Accuracy: 0.385
Epoch 5/10
Training Loss: 2.251
Validation Loss: 2.227, Validation Accuracy: 0.317
Epoch 6/10
Training Loss: 1.620
Validation Loss: 0.982, Validation Accuracy: 0.735
Epoch 7/10
Training Loss: 1.062
Validation Loss: 0.826, Validation Accuracy: 0.772
Epoch 8/10
Training Loss: 0.911
Validation Loss: 0.799, Validation Accuracy: 0.785
Epoch 9/10
Training Loss: 0.832
Validation Loss: 0.775, Validation Accuracy: 0.791
Epoch 10/10
Training Loss: 0.780
Validation Loss: 0.768, Validation Accuracy: 0.790
Training complete
Evaluation on test dataset yields
Loss: 0.7684005849206677
Accuracy: 0.7960164835164836
In [ ]:
loss, accuracy = surname_classification.evaluate_model(
    trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yields\nLoss: {loss}\nAccuracy: {accuracy}")
In [8]:
inverse_label_dict
Out[8]:
{15: 'Arabic', 3: 'Chinese', 5: 'Czech', 2: 'Dutch', 12: 'English', 8: 'French', 9: 'German', 4: 'Greek', 1: 'Irish', 17: 'Italian', 7: 'Japanese', 16: 'Korean', 14: 'Polish', 0: 'Portuguese', 13: 'Russian', 10: 'Scottish', 6: 'Spanish', 11: 'Vietnamese'}
In [7]:
utils.dill_dump(
    file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
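`utils.dill_dump` / `utils.dill_load` are thin project wrappers; presumably something along these lines (sketch, not the actual implementation):

import dill

# Hypothetical sketch of the churn_pred.utils serialization helpers.
def dill_dump_sketch(file_loc, content):
    with open(file_loc, "wb") as f:
        dill.dump(content, f)

def dill_load_sketch(file_loc):
    with open(file_loc, "rb") as f:
        return dill.load(f)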
In [ ]:
# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
    file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)
In [10]:
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model.pth"))
In [15]:
predicted_surnames
Out[15]:
[('Hsieh', 'English'), ('McDonald', 'Spanish'), ('Kharlamov', 'Arabic'), ('Vasiliev', 'English'), ('Bellucci', 'English'), ('Wallace', 'English'), ('Chao', 'Chinese'), ('Boylan', 'English'), ('Burgess', 'Chinese'), ('Cattaneo', 'English')]
In [14]:
df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()

# NOTE: the model was trained with "bert-base-cased"; using the uncased tokenizer
# here is likely a mismatch and may hurt prediction quality
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

predicted_surnames = surname_classification.predict_nationality(
    trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)
Predicting Nationalities:   0%|          | 0/10 [00:00<?, ?it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2674: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
  warnings.warn(
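`predict_nationality` is also a repo helper. Based on the warnings above (per-surname tokenization with `max_length` / `pad_to_max_length`), a sketch of the likely logic: encode each surname, run the classifier, take the arg-max class, and map it back through `inverse_label_dict`:

# Hypothetical sketch of predict_nationality.
def predict_nationality_sketch(model, surnames, tokenizer, inverse_label_dict, device):
    model.eval()
    results = []
    with torch.no_grad():
        for surname in surnames:
            enc = tokenizer(surname, return_tensors="pt", truncation=True, padding=True)
            logits = model(
                input_ids=enc["input_ids"].to(device),
                attention_mask=enc["attention_mask"].to(device),
            ).logits
            results.append((surname, inverse_label_dict[logits.argmax(dim=-1).item()]))
    return results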
In [ ]:
predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()