02_preprocessing_merge
In [1]:
Copied!
import os
import sys
sys.path.append(os.getcwd())
os.chdir("..")
import pandas as pd
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 800)
from unidecode import unidecode
import os
import sys
sys.path.append(os.getcwd())
os.chdir("..")
import pandas as pd
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 800)
from unidecode import unidecode
In [3]:
Copied!
datasets = os.listdir("data/preprocessed")
datasets
datasets = os.listdir("data/preprocessed")
datasets
Out[3]:
['name_dataset.parquet', 'surname-nationality.parquet', 'name2lang.parquet', 'surnames_with_splits.parquet', 'annotated_names_NamePrism.parquet']
In [5]:
Copied!
df = pd.DataFrame()
for dataset in datasets:
if "parquet" in dataset:
df_temp = pd.read_parquet("data/preprocessed/" + dataset)
df = pd.concat([df, df_temp])
# just in case I forgot somewhere
df["surname"] = df["surname"].apply(unidecode)
df["country"] = df["country"].apply(unidecode)
df["surname"] = df["surname"].str.strip()
df["country"] = df["country"].str.strip()
df = pd.DataFrame()
for dataset in datasets:
if "parquet" in dataset:
df_temp = pd.read_parquet("data/preprocessed/" + dataset)
df = pd.concat([df, df_temp])
# just in case I forgot somewhere
df["surname"] = df["surname"].apply(unidecode)
df["country"] = df["country"].apply(unidecode)
df["surname"] = df["surname"].str.strip()
df["country"] = df["country"].str.strip()
In [12]:
Copied!
df["country"].value_counts()
df["country"].value_counts()
country Egypt 44127319 Italy 33761554 United States 29159238 Saudi Arabia 28305845 France 18154084 Colombia 17230722 Iraq 16576059 South Africa 13769164 Mexico 12927636 Malaysia 10971884 Spain 10353376 United Kingdom 10298417 Russian Federation 9836825 Sudan 9363834 Nigeria 8754524 Peru 7846736 Brazil 7778507 United Arab Emirates 6587571 Chile 6480713 India 6073900 Germany 5680128 Netherlands 5127047 Oman 4945215 Israel 3753244 Kuwait 3484771 Bangladesh 3412053 Palestine, State of 3226308 Kazakhstan 3173225 Jordan 3032761 Canada 2957224 Belgium 2938247 Iran, Islamic Republic of 2935583 Singapore 2750451 Bolivia, Plurinational State of 2624762 Poland 2553408 Qatar 2458339 Hong Kong 2336948 Argentina 2283210 Portugal 2121761 Cameroon 1973809 Uruguay 1457692 Panama 1448629 Switzerland 1438389 Guatemala 1414100 Costa Rica 1399004 Ireland 1315101 Czechia 1282157 Finland 1214714 Austria 1166345 Sweden 967551 Ghana 870351 Philippines 837888 Mauritius 798826 Croatia 602518 Denmark 564929 Greece 558120 China 514114 Afghanistan 494042 Albania 474665 Angola 465585 Norway 417465 Japan 408043 Bulgaria 406336 Taiwan, Province of China 396089 Jamaica 355419 Hungary 351434 Macao 306174 Ecuador 281985 Botswana 235195 Slovenia 209300 Lithuania 205304 Brunei Darussalam 198660 Luxembourg 170945 Serbia 151913 Puerto Rico 126177 Indonesia 116847 Malta 107925 Cyprus 107597 Azerbaijan 91387 Georgia 85484 Estonia 83211 Maldives 81916 Moldova, Republic of 41864 Russia 38725 Iceland 26488 Korea, Republic of 18788 Turkmenistan 16043 Honduras 14413 Burundi 13961 Haiti 13781 Djibouti 12367 Ethiopia 12105 England 10583 South Korea 8374 Morocco 6373 Ukraine 6170 Czech Republic 5951 Burkina Faso 5580 Romania 4624 Turkey 4618 Fiji 4318 El Salvador 4274 Iran 4161 Arabic 3711 Pakistan 3101 Venezuela 3047 Slovakia 2646 Algeria 2643 Cambodia 2519 Belarus 2329 Thailand 2257 Taiwan 1951 Bosnia and Herzegovina 1719 Sri Lanka 1645 Kenya 1629 Nepal 1550 Latvia 1310 Vietnam 1157 Zimbabwe 1003 Palestine 1000 Lebanon 999 Tunisia 980 Paraguay 929 Nicaragua 921 Montenegro 803 Senegal 788 Uganda 764 Congo 749 Syria 689 Myanmar 684 Uzbekistan 674 Tanzania 624 Namibia 602 Bolivia 564 Guinea 482 Zambia 449 Papua New Guinea 363 Liberia 304 Sierra Leone 286 Bahrain 279 Korea 265 Libya 241 Togo 233 Benin 231 Rwanda 228 Mozambique 216 Yemen 199 Somalia 169 South Sudan 75 Syrian Arab Republic 7 Turkiye 4 Name: count, dtype: int64
In [14]:
Copied!
df["country"] = df["country"].replace(
{
"Turkiye": "Turkey",
"Syrian Arab Republic": "Syria",
"Korea, Republic of": "Korea",
"Taiwan, Province of China": "Taiwan",
"Palestine, State of": "Palestine",
"Iran, Islamic Republic of": "Iran",
"Bolivia, Plurinational State of": "Bolivia",
"South Korea": "Korea",
"Russian Federation": "Russia",
"United Kingdom": "England",
"Czech Republic": "Czechia",
}
)
df["country"] = df["country"].replace(
{
"Turkiye": "Turkey",
"Syrian Arab Republic": "Syria",
"Korea, Republic of": "Korea",
"Taiwan, Province of China": "Taiwan",
"Palestine, State of": "Palestine",
"Iran, Islamic Republic of": "Iran",
"Bolivia, Plurinational State of": "Bolivia",
"South Korea": "Korea",
"Russian Federation": "Russia",
"United Kingdom": "England",
"Czech Republic": "Czechia",
}
)
In [16]:
Copied!
(df["country"].value_counts() / len(df["country"])).round(4)
(df["country"].value_counts() / len(df["country"])).round(4)
Out[16]:
country Egypt 0.1113 Italy 0.0851 United States 0.0735 Saudi Arabia 0.0714 France 0.0458 Colombia 0.0434 Iraq 0.0418 South Africa 0.0347 Mexico 0.0326 Malaysia 0.0277 Spain 0.0261 England 0.0260 Russia 0.0249 Sudan 0.0236 Nigeria 0.0221 Peru 0.0198 Brazil 0.0196 United Arab Emirates 0.0166 Chile 0.0163 India 0.0153 Germany 0.0143 Netherlands 0.0129 Oman 0.0125 Israel 0.0095 Kuwait 0.0088 Bangladesh 0.0086 Palestine 0.0081 Kazakhstan 0.0080 Jordan 0.0076 Canada 0.0075 Iran 0.0074 Belgium 0.0074 Singapore 0.0069 Bolivia 0.0066 Poland 0.0064 Qatar 0.0062 Hong Kong 0.0059 Argentina 0.0058 Portugal 0.0053 Cameroon 0.0050 Uruguay 0.0037 Panama 0.0037 Switzerland 0.0036 Guatemala 0.0036 Costa Rica 0.0035 Ireland 0.0033 Czechia 0.0032 Finland 0.0031 Austria 0.0029 Sweden 0.0024 Ghana 0.0022 Philippines 0.0021 Mauritius 0.0020 Croatia 0.0015 Denmark 0.0014 Greece 0.0014 China 0.0013 Afghanistan 0.0012 Albania 0.0012 Angola 0.0012 Norway 0.0011 Japan 0.0010 Bulgaria 0.0010 Taiwan 0.0010 Jamaica 0.0009 Hungary 0.0009 Macao 0.0008 Ecuador 0.0007 Botswana 0.0006 Slovenia 0.0005 Lithuania 0.0005 Brunei Darussalam 0.0005 Luxembourg 0.0004 Serbia 0.0004 Puerto Rico 0.0003 Indonesia 0.0003 Malta 0.0003 Cyprus 0.0003 Azerbaijan 0.0002 Georgia 0.0002 Estonia 0.0002 Maldives 0.0002 Moldova, Republic of 0.0001 Korea 0.0001 Iceland 0.0001 Turkmenistan 0.0000 Honduras 0.0000 Burundi 0.0000 Haiti 0.0000 Djibouti 0.0000 Ethiopia 0.0000 Morocco 0.0000 Ukraine 0.0000 Burkina Faso 0.0000 Romania 0.0000 Turkey 0.0000 Fiji 0.0000 El Salvador 0.0000 Arabic 0.0000 Pakistan 0.0000 Venezuela 0.0000 Slovakia 0.0000 Algeria 0.0000 Cambodia 0.0000 Belarus 0.0000 Thailand 0.0000 Bosnia and Herzegovina 0.0000 Sri Lanka 0.0000 Kenya 0.0000 Nepal 0.0000 Latvia 0.0000 Vietnam 0.0000 Zimbabwe 0.0000 Lebanon 0.0000 Tunisia 0.0000 Paraguay 0.0000 Nicaragua 0.0000 Montenegro 0.0000 Senegal 0.0000 Uganda 0.0000 Congo 0.0000 Syria 0.0000 Myanmar 0.0000 Uzbekistan 0.0000 Tanzania 0.0000 Namibia 0.0000 Guinea 0.0000 Zambia 0.0000 Papua New Guinea 0.0000 Liberia 0.0000 Sierra Leone 0.0000 Bahrain 0.0000 Libya 0.0000 Togo 0.0000 Benin 0.0000 Rwanda 0.0000 Mozambique 0.0000 Yemen 0.0000 Somalia 0.0000 South Sudan 0.0000 Name: count, dtype: float64
In [9]:
Copied!
df["surname"].isna().sum()
df["surname"].isna().sum()
Out[9]:
0
In [8]:
Copied!
df["country"].isna().sum()
df["country"].isna().sum()
Out[8]:
0
In [17]:
Copied!
df.to_parquet("data/preprocessed/final_dataset.parquet")
df.to_parquet("data/preprocessed/final_dataset.parquet")