THIS IS A WORK IN PROGRESS

Surname origin country classification

- approach shamelessly copied from https://www.kaggle.com/code/yonatankpl/surname-classification-with-bert
- note from the original author: "Based on the discussion [here](https://www.kaggle.com/competitions/playground-series-s4e1/discussion/465517), I created a BERT-based surname classifier."
- possible improvement of the base dataset by scraping Wikipedia: https://github.com/greenelab/wiki-nationality-estimate
 
In [1]:

import os
import sys

sys.path.append(os.getcwd())
os.chdir("../..")  # assume the notebook lives in notebooks/eda/, so the project root is two levels up

import pickle

import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

from churn_pred import utils
from churn_pred.preprocessing import surname_classification
In [2]:

surname_data = pd.read_csv("data/surnames/surname-nationality.csv")
splitted_data = pd.read_csv("data/surnames/surnames_with_splits.csv")
In [5]:

splitted_data
Out[5]:

|       | nationality | nationality_index | split | surname  |
|-------|-------------|-------------------|-------|----------|
| 0     | Arabic      | 15                | train | Totah    |
| 1     | Arabic      | 15                | train | Abboud   |
| 2     | Arabic      | 15                | train | Fakhoury |
| 3     | Arabic      | 15                | train | Srour    |
| 4     | Arabic      | 15                | train | Sayegh   |
| ...   | ...         | ...               | ...   | ...      |
| 10975 | Vietnamese  | 11                | test  | Dinh     |
| 10976 | Vietnamese  | 11                | test  | Phung    |
| 10977 | Vietnamese  | 11                | test  | Quang    |
| 10978 | Vietnamese  | 11                | test  | Vu       |
| 10979 | Vietnamese  | 11                | test  | Ha       |

10980 rows × 4 columns
In [6]:

surname_data.surname.nunique()
        Out[6]:
30923
In [4]:

splitted_data["nationality_index"]
        Out[4]:
0        15
1        15
2        15
3        15
4        15
         ..
10975    11
10976    11
10977    11
10978    11
10979    11
Name: nationality_index, Length: 10980, dtype: int64
In [3]:

# map nationality names to their integer labels and build the inverse lookup
label_decoder = dict(
    zip(splitted_data["nationality"], splitted_data["nationality_index"])
)
inverse_label_dict = {v: k for k, v in label_decoder.items()}
In [4]:

# Split data into train, validation, and test sets
train_df = splitted_data[splitted_data["split"] == "train"]
val_df = splitted_data[splitted_data["split"] == "val"]  # "val" marks the validation rows
test_df = splitted_data[splitted_data["split"] == "test"]
In [5]:

# Tokenize each dataset
train_encodings = surname_classification.tokenize_data(train_df)
val_encodings = surname_classification.tokenize_data(val_df)
test_encodings = surname_classification.tokenize_data(test_df)
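`tokenize_data` comes from `churn_pred.preprocessing.surname_classification` and its body isn't shown in this notebook. A minimal sketch of what such a helper might look like, assuming it wraps the `bert-base-cased` tokenizer and pads each surname to a short fixed length (the repo's actual signature and defaults may differ):

```python
from transformers import BertTokenizer


def tokenize_data(df, model_name="bert-base-cased", max_length=16):
    """Tokenize the `surname` column into padded BERT input tensors."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    return tokenizer(
        df["surname"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
```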
    
In [8]:

batch_size = 32
train_dataset = surname_classification.create_dataset(train_encodings, train_df)
val_dataset = surname_classification.create_dataset(val_encodings, val_df)
test_dataset = surname_classification.create_dataset(test_encodings, test_df)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
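`create_dataset` is another repo helper; presumably it pairs the tokenized inputs with the `nationality_index` labels so the dataloaders can batch them together. A rough sketch under that assumption (names and signature are guesses, not the repo's actual code):

```python
import torch
from torch.utils.data import TensorDataset


def create_dataset(encodings, df):
    """Wrap BERT encodings and integer nationality labels in a TensorDataset."""
    labels = torch.tensor(df["nationality_index"].values)
    return TensorDataset(
        encodings["input_ids"],
        encodings["attention_mask"],
        labels,
    )
```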
    
In [9]:

num_labels = splitted_data["nationality_index"].nunique()
num_epochs = 10

model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=num_labels, hidden_dropout_prob=0.4
)

# freeze the BERT encoder; only the classification head is trained
for param in model.bert.parameters():
    param.requires_grad = False

optimizer = surname_classification.create_optimizer(model, learning_rate=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.2,  # NOTE: this argument is a step count, so 0.2 effectively disables warmup; int(0.2 * total_steps) would warm up over 20% of training
    num_training_steps=total_steps,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
    
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
Out[9]:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.4, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.4, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.4, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (classifier): Linear(in_features=768, out_features=18, bias=True)
)
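`create_optimizer` is also a repo helper; the FutureWarning above suggests it builds the now-deprecated `transformers.AdamW` over the trainable parameters. A hedged sketch that swaps in the PyTorch `torch.optim.AdamW` instead (so this is not the repo's exact code):

```python
from torch.optim import AdamW


def create_optimizer(model, learning_rate=5e-5, eps=1e-8):
    """Optimize only the parameters left unfrozen (here, the classifier head)."""
    trainable = [p for p in model.parameters() if p.requires_grad]
    return AdamW(trainable, lr=learning_rate, eps=eps)
```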
In [10]:

trained_model = surname_classification.train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    device,
    num_epochs=num_epochs,
)
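`train_model` is likewise defined in the repo; judging from the per-epoch log below, it runs a standard fine-tuning loop that reports training loss plus validation loss/accuracy each epoch. A compressed sketch under the batch layout assumed in the dataset sketch above, with the validation pass omitted for brevity:

```python
def train_model(model, train_dl, val_dl, optimizer, scheduler, device, num_epochs=10):
    """Minimal fine-tuning loop sketch; the repo version also evaluates on val_dl each epoch."""
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, labels in train_dl:
            optimizer.zero_grad()
            out = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            out.loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += out.loss.item()
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Training Loss: {train_loss / len(train_dl):.3f}")
    return model
```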
    
Log from running the same training as a script on a server:
(ecovadis) pmulinka@supercom-wssonata:~/assignment/ecovadis_assignment$ python notebooks/eda/surname_classification_with_bert.py
/home/pmulinka/assignment/ecovadis_assignment
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/pmulinka/assignment/ecovadis/lib/python3.10/site-packages/transformers/optimization.py:521: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
Epoch 1/10
Training Loss: 2.394
Validation Loss: 2.256, Validation Accuracy: 0.267
Epoch 2/10
Training Loss: 2.267
Validation Loss: 2.251, Validation Accuracy: 0.267
Epoch 3/10
Training Loss: 2.265
Validation Loss: 2.243, Validation Accuracy: 0.306
Epoch 4/10
Training Loss: 2.250
Validation Loss: 2.237, Validation Accuracy: 0.385
Epoch 5/10
Training Loss: 2.251
Validation Loss: 2.227, Validation Accuracy: 0.317
Epoch 6/10
Training Loss: 1.620
Validation Loss: 0.982, Validation Accuracy: 0.735
Epoch 7/10
Training Loss: 1.062
Validation Loss: 0.826, Validation Accuracy: 0.772
Epoch 8/10
Training Loss: 0.911
Validation Loss: 0.799, Validation Accuracy: 0.785
Epoch 9/10
Training Loss: 0.832
Validation Loss: 0.775, Validation Accuracy: 0.791
Epoch 10/10
Training Loss: 0.780
Validation Loss: 0.768, Validation Accuracy: 0.790
Training complete
Evaluation on test dataset yield
Loss: 0.7684005849206677
Accuracy: 0.7960164835164836
In [ ]:

loss, accuracy = surname_classification.evaluate_model(
    trained_model, test_dataloader, device
)
print(f"Evaluation on test dataset yield\nLoss: {loss}\nAccuracy: {accuracy}")
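`evaluate_model` is the repo's counterpart to `train_model`. A hedged sketch of how the test loss and accuracy could be computed, again assuming `(input_ids, attention_mask, labels)` batches:

```python
import torch


def evaluate_model(model, dataloader, device):
    """Return (mean loss, accuracy) over the given dataloader."""
    model.eval()
    total_loss, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            out = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            total_loss += out.loss.item()
            preds = out.logits.argmax(dim=-1).cpu()
            correct += (preds == labels).sum().item()
            seen += labels.size(0)
    return total_loss / len(dataloader), correct / seen
```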
    
In [8]:

inverse_label_dict
        Out[8]:
{15: 'Arabic',
 3: 'Chinese',
 5: 'Czech',
 2: 'Dutch',
 12: 'English',
 8: 'French',
 9: 'German',
 4: 'Greek',
 1: 'Irish',
 17: 'Italian',
 7: 'Japanese',
 16: 'Korean',
 14: 'Polish',
 0: 'Portuguese',
 13: 'Russian',
 10: 'Scottish',
 6: 'Spanish',
 11: 'Vietnamese'}
In [7]:

# persist the index -> nationality mapping so inference code can reuse it
utils.dill_dump(
    file_loc="data/surnames/inverse_label_dict.dill", content=inverse_label_dict
)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
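`utils.dill_dump` / `utils.dill_load` are small helpers from `churn_pred.utils`, presumably thin wrappers around `dill` serialization, roughly:

```python
import dill


def dill_dump(file_loc, content):
    """Serialize any Python object to disk with dill."""
    with open(file_loc, "wb") as f:
        dill.dump(content, f)


def dill_load(file_loc):
    """Load a dill-serialized object from disk."""
    with open(file_loc, "rb") as f:
        return dill.load(f)
```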
    
In [ ]:

# Save the model
torch.save(trained_model.state_dict(), "surname_model_state_dict.pth")
torch.save(trained_model, "data/surnames/surname_model.pth")
utils.json_dump(
    file_loc="data/surnames/inverse_label_dict.json", content=inverse_label_dict
)
In [10]:

# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("data/surnames/surname_model.pth", map_location=device)
inverse_label_dict = utils.dill_load(file_loc="data/surnames/inverse_label_dict.dill")
# alternative: load only the weights into a freshly constructed model
# model.eval()
# trained_model.load_state_dict(torch.load("surname_model_state_dict.pth"))
In [15]:

predicted_surnames
        Out[15]:
[('Hsieh', 'English'),
 ('McDonald', 'Spanish'),
 ('Kharlamov', 'Arabic'),
 ('Vasiliev', 'English'),
 ('Bellucci', 'English'),
 ('Wallace', 'English'),
 ('Chao', 'Chinese'),
 ('Boylan', 'English'),
 ('Burgess', 'Chinese'),
 ('Cattaneo', 'English')]
In [14]:

df_pd = pd.read_csv("data/dataset.csv")
uq_surnames = df_pd.Surname.unique()
# NOTE: the model checkpoint is bert-base-cased; using the matching cased tokenizer here
# would keep the vocabularies consistent and likely improve the predictions shown above
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
predicted_surnames = surname_classification.predict_nationality(
    trained_model, uq_surnames[:10], tokenizer, inverse_label_dict, device
)
Predicting Nationalities:   0%|          | 0/10 [00:00<?, ?it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
/mnt/c/#work/ecovadis/ecovadis/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2674: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
  warnings.warn(
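`predict_nationality` is the last repo helper used here; the progress bar and the `pad_to_max_length` deprecation warning above suggest it tokenizes each surname individually and maps the argmax class back through `inverse_label_dict`. A hedged sketch of the idea, using the modern `padding="max_length"` argument instead of the deprecated one:

```python
import torch
from tqdm import tqdm


def predict_nationality(model, surnames, tokenizer, inverse_label_dict, device):
    """Predict a nationality label for each surname, one name at a time."""
    model.eval()
    results = []
    with torch.no_grad():
        for name in tqdm(surnames, desc="Predicting Nationalities"):
            enc = tokenizer(
                name,
                padding="max_length",
                truncation=True,
                max_length=16,
                return_tensors="pt",
            ).to(device)
            logits = model(**enc).logits
            label = int(logits.argmax(dim=-1).item())
            results.append((name, inverse_label_dict[label]))
    return results
```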
In [ ]:

predicted_df = pd.DataFrame(predicted_surnames, columns=["Surname", "Nationality"])
predicted_df.to_csv("predicted_surnames_dataset.csv")
predicted_df.head()