THIS IS A WORK IN PROGRESS¶

Surname origin country classification¶

approach shamelessly copied from https://www.kaggle.com/code/yonatankpl/surname-classification-with-bert

NOTE from author:

Based on the discussion [here](https://www.kaggle.com/competitions/playground-series-s4e1/discussion/465517), I created a Bert based Surname classifier.

possible improvement of the base dataset by scraping wiki: https://github.com/greenelab/wiki-nationality-estimate

In [1]:

            
                Copied!
                
                    
                    
                
                

        
import os
import sys

sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd
import torch
from src import surname_classification
from src import utils
from src.label_encoder import LabelEncoder
from torch.utils.data import DataLoader
import pickle
from transformers import (
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    BertTokenizer,
)
from itertools import islice
from sklearn.model_selection import train_test_split
import os
import sys

sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd
import torch
from src import surname_classification
from src import utils
from src.label_encoder import LabelEncoder
from torch.utils.data import DataLoader
import pickle
from transformers import (
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    BertTokenizer,
)
from itertools import islice
from sklearn.model_selection import train_test_split

In [3]:

            
                Copied!
                
# surname_data = pd.read_csv("data/raw/surname-nationality.csv")
# splitted_data = pd.read_csv("data/raw/surnames_with_splits.csv")
# surname_data = pd.read_csv("data/raw/surname-nationality.csv")
# splitted_data = pd.read_csv("data/raw/surnames_with_splits.csv")

In [2]:

            
                Copied!
                
df = pd.read_parquet("data/preprocessed/final_dataset.parquet")
df = pd.read_parquet("data/preprocessed/final_dataset.parquet")

In [3]:

            
                Copied!
                
df.head()
df.head()

Out[3]:

	surname	country
0	Mengel	Estonia
1	Saaremae	Estonia
2	Rikkiev	Estonia
3	Est	Estonia
4	Villandi	Estonia

In [4]:

            
                Copied!
                
label_encoder = LabelEncoder(columns_to_encode=["country"])
df = label_encoder.fit_transform(df)
# df = self.label_encoder.transform(df)
label_encoder = LabelEncoder(columns_to_encode=["country"])
df = label_encoder.fit_transform(df)
# df = self.label_encoder.transform(df)

In [13]:

            
                Copied!
                
list(islice(label_encoder.encoding_dict["country"].items(), 5))
list(islice(label_encoder.encoding_dict["country"].items(), 5))

Out[13]:

[('Estonia', 1), ('Bolivia', 2), ('Tunisia', 3), ('England', 4), ('Russia', 5)]

In [6]:

            
                Copied!
                
df.head()
df.head()

Out[6]:

	surname	country
0	Mengel	1
1	Saaremae	1
2	Rikkiev	1
3	Est	1
4	Villandi	1

In [7]:

            
                Copied!
                
                    
                    
                
                

        
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
    df, test_size=valid_size, stratify=df["country"], random_state=random_state
)
df_valid, df_test = train_test_split(
    df_valid,
    test_size=test_size,
    stratify=df_valid["country"],
    random_state=random_state,
)
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
    df, test_size=valid_size, stratify=df["country"], random_state=random_state
)
df_valid, df_test = train_test_split(
    df_valid,
    test_size=test_size,
    stratify=df_valid["country"],
    random_state=random_state,
)

In [ ]:

            
                Copied!
                
# Tokenize each dataset
# train_encodings = surname_classification.tokenize_data(df_train, surname_col="surname")
# val_encodings = surname_classification.tokenize_data(df_val, surname_col="surname")
test_encodings = surname_classification.tokenize_data(df_test, surname_col="surname")
# Tokenize each dataset
# train_encodings = surname_classification.tokenize_data(df_train, surname_col="surname")
# val_encodings = surname_classification.tokenize_data(df_val, surname_col="surname")
test_encodings = surname_classification.tokenize_data(df_test, surname_col="surname")

In [8]:

            
                Copied!
                
                    
                    
                
                

        
batch_size = 32

dataset_train = surname_classification.create_dataset(
    train_encodings, df_train, labels="country"
)
dataset_val = surname_classification.create_dataset(
    val_encodings, df_val, labels="country"
)
dataset_test = surname_classification.create_dataset(
    test_encodings, df_test, labels="country"
)

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size)
batch_size = 32

dataset_train = surname_classification.create_dataset(
    train_encodings, df_train, labels="country"
)
dataset_val = surname_classification.create_dataset(
    val_encodings, df_val, labels="country"
)
dataset_test = surname_classification.create_dataset(
    test_encodings, df_test, labels="country"
)

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size)

In [9]:

            
                Copied!
                
                    
                    
                
                

        
num_labels = df["country"].nunique()
num_epochs = 2
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)

# freeze layers
for param in model.bert.parameters():
    param.requires_grad = False

optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.2,  # Usually a fraction of total_steps
    num_training_steps=total_steps,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_labels = df["country"].nunique()
num_epochs = 2
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)

# freeze layers
for param in model.bert.parameters():
    param.requires_grad = False

optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.2,  # Usually a fraction of total_steps
    num_training_steps=total_steps,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(

Out[9]:

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.4, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.4, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.4, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (classifier): Linear(in_features=768, out_features=18, bias=True)
)

In [10]:

            
                Copied!
                
                    
                    
                
                

        
trained_model = surname_classification.train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    device,
    num_epochs=num_epochs,
)
trained_model = surname_classification.train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    device,
    num_epochs=num_epochs,
)

log from running a script on a server

(ecovadis) pmulinka@supercom-wssonata:~/assignment/ecovadis_assignment$ python notebooks/eda/surname_classification_with_bert.py
/home/pmulinka/assignment/ecovadis_assignment
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/pmulinka/assignment/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(

Epoch 1/10
Training Loss: 2.394
Validation Loss: 2.256, Validation Accuracy: 0.267

Epoch 2/10
Training Loss: 2.267
Validation Loss: 2.251, Validation Accuracy: 0.267

Epoch 3/10
Training Loss: 2.265
Validation Loss: 2.243, Validation Accuracy: 0.306

Epoch 4/10
Training Loss: 2.250
Validation Loss: 2.237, Validation Accuracy: 0.385

Epoch 5/10
Training Loss: 2.251
Validation Loss: 2.227, Validation Accuracy: 0.317

Epoch 6/10
Training Loss: 1.620
Validation Loss: 0.982, Validation Accuracy: 0.735

Epoch 7/10
Training Loss: 1.062
Validation Loss: 0.826, Validation Accuracy: 0.772

Epoch 8/10
Training Loss: 0.911
Validation Loss: 0.799, Validation Accuracy: 0.785

Epoch 9/10
Training Loss: 0.832
Validation Loss: 0.775, Validation Accuracy: 0.791

Epoch 10/10
Training Loss: 0.780
Validation Loss: 0.768, Validation Accuracy: 0.790
Training complete
Evaluation on test dataset yield
Loss: 0.7684005849206677
Accuracy: 0.7960164835164836

In [ ]:

            
                Copied!
                
loss, accuracy = surname_classification.evaluate_model(
    trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yield\nLoss: {loss}\nAccuracy: {accuracy}")
loss, accuracy = surname_classification.evaluate_model(
    trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yield\nLoss: {loss}\nAccuracy: {accuracy}")

In [7]:

            
                Copied!
                
utils.dill_dump(
    file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
utils.dill_dump(
    file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
    file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)
# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
    file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)

In [10]:

            
                Copied!
                
                    
                    
                
                

        
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model.pth"))
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model.pth"))

In [15]:

            
                Copied!
                
predicted_surnames
predicted_surnames

Out[15]:

[('Hsieh', 'English'),
 ('McDonald', 'Spanish'),
 ('Kharlamov', 'Arabic'),
 ('Vasiliev', 'English'),
 ('Bellucci', 'English'),
 ('Wallace', 'English'),
 ('Chao', 'Chinese'),
 ('Boylan', 'English'),
 ('Burgess', 'Chinese'),
 ('Cattaneo', 'English')]

In [14]:

            
                Copied!
                
                    
                    
                
                

        
df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
predicted_surnames = surname_classification.predict_nationality(
    trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)
df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
predicted_surnames = surname_classification.predict_nationality(
    trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)

Predicting Nationalities:   0%|          | 0/10 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2674: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
  warnings.warn(

In [ ]:

            
                Copied!
                
predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()
predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()

In [ ]: