01_preprocessing
In [1]:
import os
import sys

# make local modules importable and run from the repository root,
# so relative paths like "data/raw/..." resolve
sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd
In [104]:
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 800)
In [13]:
from unidecode import unidecode
In [247]:
datasets = os.listdir("data/raw/name_dataset/data")
In [277]:
# Combine the per-country CSVs into one surname/country dataframe.
# Uses the get_country_name helper defined further down in this notebook.
df = pd.DataFrame()
for dataset in datasets:
    if "csv" in dataset:
        df_temp = pd.read_csv(
            "data/raw/name_dataset/data/" + dataset,
            names=["firstname", "surname", "gender", "country_code"],
        )
        df_temp = df_temp.dropna()
        df_temp["country"] = get_country_name(
            df=df_temp, country_name_col="country_code"
        )
        df_temp = df_temp.drop(columns=["firstname", "gender", "country_code"])
        df_temp["surname"] = df_temp["surname"].apply(unidecode)
        df = pd.concat([df, df_temp])
df.to_parquet("data/preprocessed/name_dataset.parquet")
In [278]:
df.to_parquet("data/preprocessed/name_dataset.parquet")
In [155]:
df = pd.read_csv("data/raw/annotated_names_NamePrism.tsv", sep="\t")
In [156]:
df.head()
Out[156]:
| | id | name | ethnicity | country |
|---|---|---|---|---|
| 0 | 1 | Marius Aam | Nordic | Norway |
| 1 | 2 | Aamani | SouthAsian | India |
| 2 | 3 | Abdelkader Aamara | Muslim | Morocco |
| 3 | 4 | Jarle Alex Aambø | Nordic | Norway |
| 4 | 5 | Syed Aamer Ali | Muslim | Pakistan |
In [157]:
len(df)
Out[157]:
460500
In [158]:
df = df.drop(columns=["id", "ethnicity"])
In [159]:
df["name"] = df["name"].apply(unidecode)
In [160]:
# drop rows whose "name" contains titles, honorifics, or sentence fragments
# that indicate the entry is not a plain personal name
words_to_filter = [
    "daughter",
    "Prince",
    "Chief",
    "Reverend",
    " with",
    "admiral",
    "The",
    "Count",
    " from",
    "aka",
    "Prof",
    "Lieutenant",
    "Princess",
    "Windysport",
    " as",
    " is",
    " at",
    " to",
    "nicknamed",
    " name",
    " by",
    " of",
    " and",
    " in",
    "Personal",
    "Honour",
    "postage",
    "Professor",
    " Award",
    "General",
    "Admiral",
    "Born",
    "Website",
    "'",
    r"\)",
    r"\(",
    "Lady",
    "Lord",
]
for word in words_to_filter:
    df = df[~df["name"].str.contains(word)]
# filter acronyms
df = df[~df["name"].str.contains(r"\b[A-Z](?:[&.]?[A-Z])+\b")]
In [161]:
# keep the last whitespace-separated token of each name as the surname
df["surname"] = df["name"].str.split(" ", expand=True).ffill(axis=1).iloc[:, -1]
In [143]:
len(df)
Out[143]:
448898
In [163]:
df.drop(columns=["name"]).to_parquet(
    "data/preprocessed/annotated_names_NamePrism.parquet"
)
In [164]:
df = pd.read_csv("data/raw/surname-nationality.csv")
In [167]:
# .copy() avoids the SettingWithCopyWarning when adding the "country" column below
df = df[["surname", "nationality"]].copy()
In [168]:
df["nationality"].unique()
Out[168]:
array(['Ethiopian', 'Honduran', 'Nigerian', 'Malaysian', 'Chilean', 'Portuguese', 'Papua New Guinean', 'Algerian', 'Brazilian', 'Venezuelan', 'Ukrainian', 'South African', 'Nicaraguan', 'Moroccan', 'Finnish', 'Mexican', 'Palestinian', 'Nepalese', 'Peruvian', 'Dutch', 'Arabic', 'Irish', 'Spanish', 'French', 'German', 'English', 'Korean', 'Indian', 'Vietnamese', 'Scottish', 'Japanese', 'Polish', 'Greek', 'Czech', 'Italian', 'Russian', 'Chinese'], dtype=object)
In [169]:
# map demonyms to country names
df["country"] = df["nationality"].replace(
    {
        "Ethiopian": "Ethiopia",
        "Honduran": "Honduras",
        "Nigerian": "Nigeria",
        "Malaysian": "Malaysia",
        "Chilean": "Chile",
        "Portuguese": "Portugal",
        "Papua New Guinean": "Papua New Guinea",
        "Algerian": "Algeria",
        "Brazilian": "Brazil",
        "Venezuelan": "Venezuela",
        "Ukrainian": "Ukraine",
        "South African": "South Africa",
        "Nicaraguan": "Nicaragua",
        "Moroccan": "Morocco",
        "Finnish": "Finland",
        "Mexican": "Mexico",
        "Palestinian": "Palestine",
        "Nepalese": "Nepal",
        "Peruvian": "Peru",
        "Dutch": "Netherlands",
        "Arabic": "Arabic",
        "Irish": "Ireland",
        "Spanish": "Spain",
        "French": "France",
        "German": "Germany",
        "English": "England",
        "Korean": "Korea",
        "Indian": "India",
        "Vietnamese": "Vietnam",
        "Scottish": "England",
        "Japanese": "Japan",
        "Polish": "Poland",
        "Greek": "Greece",
        "Czech": "Czechia",
        "Italian": "Italy",
        "Russian": "Russia",
        "Chinese": "China",
    }
)
In [170]:
df
Out[170]:
| | surname | nationality | country |
|---|---|---|---|
| 0 | Tesfaye | Ethiopian | Ethiopia |
| 1 | Mohammed | Ethiopian | Ethiopia |
| 2 | Getachew | Ethiopian | Ethiopia |
| 3 | Abebe | Ethiopian | Ethiopia |
| 4 | Girma | Ethiopian | Ethiopia |
| ... | ... | ... | ... |
| 36236 | Yun | Chinese | China |
| 36237 | Zhai | Chinese | China |
| 36238 | Zhi | Chinese | China |
| 36239 | Zhuan | Chinese | China |
| 36240 | Zhui | Chinese | China |

36241 rows × 3 columns
In [171]:
df.drop(columns=["nationality"]).to_parquet(
    "data/preprocessed/surname-nationality.parquet"
)
In [172]:
df = pd.read_csv("data/raw/surnames_with_splits.csv")
In [174]:
# .copy() avoids the SettingWithCopyWarning when adding the "country" column below
df = df[["surname", "nationality"]].copy()
In [175]:
df["nationality"].unique()
Out[175]:
array(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'], dtype=object)
In [176]:
# map demonyms to country names
df["country"] = df["nationality"].replace(
    {
        "Arabic": "Arabic",
        "Chinese": "China",
        "Czech": "Czechia",
        "Dutch": "Netherlands",
        "English": "England",
        "French": "France",
        "German": "Germany",
        "Greek": "Greece",
        "Irish": "Ireland",
        "Italian": "Italy",
        "Japanese": "Japan",
        "Korean": "Korea",
        "Polish": "Poland",
        "Portuguese": "Portugal",
        "Russian": "Russia",
        "Scottish": "England",
        "Spanish": "Spain",
        "Vietnamese": "Vietnam",
    }
)
In [177]:
df.drop(columns=["nationality"]).to_parquet(
    "data/preprocessed/surnames_with_splits.parquet"
)
In [235]:
df = pd.read_csv("data/raw/final_all_names_code.csv")

import pycountry


def get_country_name(df: pd.DataFrame, country_name_col: str) -> pd.Series:
    """Map ISO 3166-1 alpha-2 codes in `country_name_col` to country names.

    Args:
        df (pd.DataFrame): input dataframe
        country_name_col (str): column holding the alpha-2 country codes

    Returns:
        pd.Series: country names, with "NA" where a code cannot be resolved
    """
    dfc = df.copy()

    def _get_country_name_map(country_name: str) -> str:
        """Helper function for get_country_name"""
        try:
            result = pycountry.countries.get(alpha_2=country_name)
            name_str = result.name
        except (AttributeError, LookupError):
            # pycountry returns None (or raises) for unknown codes
            name_str = "NA"
        return name_str

    dfc["res"] = "NA"
    for country_name in dfc[country_name_col].unique():
        dfc.loc[dfc[country_name_col] == country_name, ["res"]] = _get_country_name_map(
            country_name
        )
    return dfc["res"]
In [236]:
df
Out[236]:
| | Name | Country_code | Country |
|---|---|---|---|
| 0 | Amy Johnson | ar_AE | AE |
| 1 | Thomas Wright | ar_AE | AE |
| 2 | Mr. Marco Flores DDS | ar_AE | AE |
| 3 | Marcus Robbins | ar_AE | AE |
| 4 | Susan Montgomery | ar_AE | AE |
| ... | ... | ... | ... |
| 404057 | Yang Lu | zh_TW | TW |
| 404058 | Liu Yuxin | zh_TW | TW |
| 404059 | Wu Wei Ting | zh_TW | TW |
| 404060 | Li Shuting | zh_TW | TW |
| 404061 | Sujiarong | zh_TW | TW |

404062 rows × 3 columns
In [237]:
df["country"] = get_country_name(df=df, country_name_col="Country")
In [238]:
len(df[df["country"] == "NA"]), df["country"].unique()
Out[238]:
(0, array(['United Arab Emirates', 'Egypt', 'Jordan', 'Palestine, State of', 'Saudi Arabia', 'Azerbaijan', 'Bulgaria', 'Bangladesh', 'Bosnia and Herzegovina', 'Czechia', 'Denmark', 'Germany', 'Austria', 'Switzerland', 'Cyprus', 'Greece', 'Australia', 'Canada', 'United Kingdom', 'Ireland', 'India', 'New Zealand', 'Philippines', 'Thailand', 'United States', 'Spain', 'Colombia', 'Mexico', 'Estonia', 'Iran, Islamic Republic of', 'Finland', 'France', 'Israel', 'Croatia', 'Hungary', 'Armenia', 'Indonesia', 'Italy', 'Japan', 'Georgia', 'Korea, Republic of', "Lao People's Democratic Republic", 'Luxembourg', 'Lithuania', 'Latvia', 'Malta', 'Nepal', 'Belgium', 'Netherlands', 'Norway', 'Poland', 'Brazil', 'Portugal', 'Romania', 'Russian Federation', 'Slovakia', 'Slovenia', 'Sweden', 'Türkiye', 'Ghana', 'Ukraine', 'China', 'Taiwan, Province of China'], dtype=object))
In [239]:
df = df.drop(columns=["Country", "Country_code"])
In [240]:
df = df.dropna()
In [241]:
# same cleanup as for the NamePrism names: strip whitespace, drop entries that
# contain titles/honorifics or sentence fragments, drop acronyms, and keep the
# last token of each name as the surname
df["Name"] = df["Name"].str.strip()
words_to_filter = [
    "daughter",
    "Prince",
    "Chief",
    "Reverend",
    " with",
    "admiral",
    "The",
    "Count",
    " from",
    "aka",
    "Prof",
    "Lieutenant",
    "Princess",
    "Windysport",
    " as",
    " is",
    " at",
    " to",
    "nicknamed",
    " name",
    " by",
    " of",
    " and",
    " in",
    "Personal",
    "Honour",
    "postage",
    "Professor",
    " Award",
    "General",
    "Admiral",
    "Born",
    "Website",
    "'",
    r"\)",
    r"\(",
    "Lady",
    "Lord",
]
for word in words_to_filter:
    df = df[~df["Name"].str.contains(word)]
# filter acronyms
df = df[~df["Name"].str.contains(r"\b[A-Z](?:[&.]?[A-Z])+\b")]
df["surname"] = df["Name"].str.split(" ", expand=True).ffill(axis=1).iloc[:, -1]
In [243]:
df[df["country"] == "Slovakia"]
Out[243]:
| | Name | country | surname |
|---|---|---|---|
| 345072 | Brooke Griffin | Slovakia | Griffin |
| 345073 | Derrick Gay | Slovakia | Gay |
| 345074 | James Villegas | Slovakia | Villegas |
| 345075 | Johnny Reed | Slovakia | Reed |
| 345076 | Meagan Cooper | Slovakia | Cooper |
| ... | ... | ... | ... |
| 348392 | Crystal Frost | Slovakia | Frost |
| 348393 | Traci Reyes | Slovakia | Reyes |
| 348394 | Nichole Bowen | Slovakia | Bowen |
| 348395 | Sarah Gregory | Slovakia | Gregory |
| 348396 | Cheryl Baker | Slovakia | Baker |

3179 rows × 3 columns
Hmm, the supposedly Slovak entries are just generic English names, so this dataset looks unreliable and is not saved to the preprocessed data.
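One way to back that suspicion up (a purely illustrative check, not in the original notebook) is to count how many of the supposedly Slovak surnames also appear under United States entries:

slovak = set(df.loc[df["country"] == "Slovakia", "surname"])
us = set(df.loc[df["country"] == "United States", "surname"])
len(slovak & us), len(slovak)  # a large overlap suggests the names are not actually Slovak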
In [190]:
df = pd.read_csv("data/raw/name2lang.txt", names=["surname", "nationality"])
In [191]:
df["nationality"].unique()
Out[191]:
array([' Portuguese', ' Irish', ' Spanish', ' Vietnamese', ' Chinese', ' Greek', ' Czech', ' Dutch', ' Japanese', ' French', ' German', ' Scottish', ' English', ' Russian', 'Russian', ' Polish', ' Arabic', ' Korean', ' Italian'], dtype=object)
In [192]:
# map demonyms to country names (most raw values carry a leading space)
df["country"] = df["nationality"].replace(
    {
        " Arabic": "Arabic",
        " Chinese": "China",
        " Czech": "Czechia",
        " Dutch": "Netherlands",
        " English": "England",
        " French": "France",
        " German": "Germany",
        " Greek": "Greece",
        " Irish": "Ireland",
        " Italian": "Italy",
        " Japanese": "Japan",
        " Korean": "Korea",
        " Polish": "Poland",
        " Portuguese": "Portugal",
        "Russian": "Russia",
        " Russian": "Russia",
        " Scottish": "England",
        " Spanish": "Spain",
        " Vietnamese": "Vietnam",
    }
)
In [193]:
df
Out[193]:
| | surname | nationality | country |
|---|---|---|---|
| 0 | Abreu | Portuguese | Portugal |
| 1 | Albuquerque | Portuguese | Portugal |
| 2 | Almeida | Portuguese | Portugal |
| 3 | Alves | Portuguese | Portugal |
| 4 | Araujo | Portuguese | Portugal |
| ... | ... | ... | ... |
| 20045 | Zappa | Italian | Italy |
| 20046 | Zeni | Italian | Italy |
| 20047 | Zini | Italian | Italy |
| 20048 | Zino | Italian | Italy |
| 20049 | Zunino | Italian | Italy |

20050 rows × 3 columns
In [194]:
df.drop(columns=["nationality"]).to_parquet("data/preprocessed/name2lang.parquet")