02_preprocessing_merge

In [1]:

            
                Copied!
                
import os
import sys

sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 800)
from unidecode import unidecode
import os
import sys

sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 800)
from unidecode import unidecode

In [3]:

            
                Copied!
                
datasets = os.listdir("data/preprocessed")
datasets
datasets = os.listdir("data/preprocessed")
datasets

Out[3]:

['name_dataset.parquet',
 'surname-nationality.parquet',
 'name2lang.parquet',
 'surnames_with_splits.parquet',
 'annotated_names_NamePrism.parquet']

In [5]:

            
                Copied!
                
                    
                    
                
                

        
df = pd.DataFrame()
for dataset in datasets:
    if "parquet" in dataset:
        df_temp = pd.read_parquet("data/preprocessed/" + dataset)
        df = pd.concat([df, df_temp])

# just in case I forgot somewhere
df["surname"] = df["surname"].apply(unidecode)
df["country"] = df["country"].apply(unidecode)
df["surname"] = df["surname"].str.strip()
df["country"] = df["country"].str.strip()
df = pd.DataFrame()
for dataset in datasets:
    if "parquet" in dataset:
        df_temp = pd.read_parquet("data/preprocessed/" + dataset)
        df = pd.concat([df, df_temp])

# just in case I forgot somewhere
df["surname"] = df["surname"].apply(unidecode)
df["country"] = df["country"].apply(unidecode)
df["surname"] = df["surname"].str.strip()
df["country"] = df["country"].str.strip()

In [12]:

            
                Copied!
                
df["country"].value_counts()
df["country"].value_counts()

country
Egypt                              44127319
Italy                              33761554
United States                      29159238
Saudi Arabia                       28305845
France                             18154084
Colombia                           17230722
Iraq                               16576059
South Africa                       13769164
Mexico                             12927636
Malaysia                           10971884
Spain                              10353376
United Kingdom                     10298417
Russian Federation                  9836825
Sudan                               9363834
Nigeria                             8754524
Peru                                7846736
Brazil                              7778507
United Arab Emirates                6587571
Chile                               6480713
India                               6073900
Germany                             5680128
Netherlands                         5127047
Oman                                4945215
Israel                              3753244
Kuwait                              3484771
Bangladesh                          3412053
Palestine, State of                 3226308
Kazakhstan                          3173225
Jordan                              3032761
Canada                              2957224
Belgium                             2938247
Iran, Islamic Republic of           2935583
Singapore                           2750451
Bolivia, Plurinational State of     2624762
Poland                              2553408
Qatar                               2458339
Hong Kong                           2336948
Argentina                           2283210
Portugal                            2121761
Cameroon                            1973809
Uruguay                             1457692
Panama                              1448629
Switzerland                         1438389
Guatemala                           1414100
Costa Rica                          1399004
Ireland                             1315101
Czechia                             1282157
Finland                             1214714
Austria                             1166345
Sweden                               967551
Ghana                                870351
Philippines                          837888
Mauritius                            798826
Croatia                              602518
Denmark                              564929
Greece                               558120
China                                514114
Afghanistan                          494042
Albania                              474665
Angola                               465585
Norway                               417465
Japan                                408043
Bulgaria                             406336
Taiwan, Province of China            396089
Jamaica                              355419
Hungary                              351434
Macao                                306174
Ecuador                              281985
Botswana                             235195
Slovenia                             209300
Lithuania                            205304
Brunei Darussalam                    198660
Luxembourg                           170945
Serbia                               151913
Puerto Rico                          126177
Indonesia                            116847
Malta                                107925
Cyprus                               107597
Azerbaijan                            91387
Georgia                               85484
Estonia                               83211
Maldives                              81916
Moldova, Republic of                  41864
Russia                                38725
Iceland                               26488
Korea, Republic of                    18788
Turkmenistan                          16043
Honduras                              14413
Burundi                               13961
Haiti                                 13781
Djibouti                              12367
Ethiopia                              12105
England                               10583
South Korea                            8374
Morocco                                6373
Ukraine                                6170
Czech Republic                         5951
Burkina Faso                           5580
Romania                                4624
Turkey                                 4618
Fiji                                   4318
El Salvador                            4274
Iran                                   4161
Arabic                                 3711
Pakistan                               3101
Venezuela                              3047
Slovakia                               2646
Algeria                                2643
Cambodia                               2519
Belarus                                2329
Thailand                               2257
Taiwan                                 1951
Bosnia and Herzegovina                 1719
Sri Lanka                              1645
Kenya                                  1629
Nepal                                  1550
Latvia                                 1310
Vietnam                                1157
Zimbabwe                               1003
Palestine                              1000
Lebanon                                 999
Tunisia                                 980
Paraguay                                929
Nicaragua                               921
Montenegro                              803
Senegal                                 788
Uganda                                  764
Congo                                   749
Syria                                   689
Myanmar                                 684
Uzbekistan                              674
Tanzania                                624
Namibia                                 602
Bolivia                                 564
Guinea                                  482
Zambia                                  449
Papua New Guinea                        363
Liberia                                 304
Sierra Leone                            286
Bahrain                                 279
Korea                                   265
Libya                                   241
Togo                                    233
Benin                                   231
Rwanda                                  228
Mozambique                              216
Yemen                                   199
Somalia                                 169
South Sudan                              75
Syrian Arab Republic                      7
Turkiye                                   4
Name: count, dtype: int64

In [14]:

            
                Copied!
                
                    
                    
                
                

        
df["country"] = df["country"].replace(
    {
        "Turkiye": "Turkey",
        "Syrian Arab Republic": "Syria",
        "Korea, Republic of": "Korea",
        "Taiwan, Province of China": "Taiwan",
        "Palestine, State of": "Palestine",
        "Iran, Islamic Republic of": "Iran",
        "Bolivia, Plurinational State of": "Bolivia",
        "South Korea": "Korea",
        "Russian Federation": "Russia",
        "United Kingdom": "England",
        "Czech Republic": "Czechia",
    }
)
df["country"] = df["country"].replace(
    {
        "Turkiye": "Turkey",
        "Syrian Arab Republic": "Syria",
        "Korea, Republic of": "Korea",
        "Taiwan, Province of China": "Taiwan",
        "Palestine, State of": "Palestine",
        "Iran, Islamic Republic of": "Iran",
        "Bolivia, Plurinational State of": "Bolivia",
        "South Korea": "Korea",
        "Russian Federation": "Russia",
        "United Kingdom": "England",
        "Czech Republic": "Czechia",
    }
)

In [16]:

            
                Copied!
                
(df["country"].value_counts() / len(df["country"])).round(4)
(df["country"].value_counts() / len(df["country"])).round(4)

Out[16]:

country
Egypt                     0.1113
Italy                     0.0851
United States             0.0735
Saudi Arabia              0.0714
France                    0.0458
Colombia                  0.0434
Iraq                      0.0418
South Africa              0.0347
Mexico                    0.0326
Malaysia                  0.0277
Spain                     0.0261
England                   0.0260
Russia                    0.0249
Sudan                     0.0236
Nigeria                   0.0221
Peru                      0.0198
Brazil                    0.0196
United Arab Emirates      0.0166
Chile                     0.0163
India                     0.0153
Germany                   0.0143
Netherlands               0.0129
Oman                      0.0125
Israel                    0.0095
Kuwait                    0.0088
Bangladesh                0.0086
Palestine                 0.0081
Kazakhstan                0.0080
Jordan                    0.0076
Canada                    0.0075
Iran                      0.0074
Belgium                   0.0074
Singapore                 0.0069
Bolivia                   0.0066
Poland                    0.0064
Qatar                     0.0062
Hong Kong                 0.0059
Argentina                 0.0058
Portugal                  0.0053
Cameroon                  0.0050
Uruguay                   0.0037
Panama                    0.0037
Switzerland               0.0036
Guatemala                 0.0036
Costa Rica                0.0035
Ireland                   0.0033
Czechia                   0.0032
Finland                   0.0031
Austria                   0.0029
Sweden                    0.0024
Ghana                     0.0022
Philippines               0.0021
Mauritius                 0.0020
Croatia                   0.0015
Denmark                   0.0014
Greece                    0.0014
China                     0.0013
Afghanistan               0.0012
Albania                   0.0012
Angola                    0.0012
Norway                    0.0011
Japan                     0.0010
Bulgaria                  0.0010
Taiwan                    0.0010
Jamaica                   0.0009
Hungary                   0.0009
Macao                     0.0008
Ecuador                   0.0007
Botswana                  0.0006
Slovenia                  0.0005
Lithuania                 0.0005
Brunei Darussalam         0.0005
Luxembourg                0.0004
Serbia                    0.0004
Puerto Rico               0.0003
Indonesia                 0.0003
Malta                     0.0003
Cyprus                    0.0003
Azerbaijan                0.0002
Georgia                   0.0002
Estonia                   0.0002
Maldives                  0.0002
Moldova, Republic of      0.0001
Korea                     0.0001
Iceland                   0.0001
Turkmenistan              0.0000
Honduras                  0.0000
Burundi                   0.0000
Haiti                     0.0000
Djibouti                  0.0000
Ethiopia                  0.0000
Morocco                   0.0000
Ukraine                   0.0000
Burkina Faso              0.0000
Romania                   0.0000
Turkey                    0.0000
Fiji                      0.0000
El Salvador               0.0000
Arabic                    0.0000
Pakistan                  0.0000
Venezuela                 0.0000
Slovakia                  0.0000
Algeria                   0.0000
Cambodia                  0.0000
Belarus                   0.0000
Thailand                  0.0000
Bosnia and Herzegovina    0.0000
Sri Lanka                 0.0000
Kenya                     0.0000
Nepal                     0.0000
Latvia                    0.0000
Vietnam                   0.0000
Zimbabwe                  0.0000
Lebanon                   0.0000
Tunisia                   0.0000
Paraguay                  0.0000
Nicaragua                 0.0000
Montenegro                0.0000
Senegal                   0.0000
Uganda                    0.0000
Congo                     0.0000
Syria                     0.0000
Myanmar                   0.0000
Uzbekistan                0.0000
Tanzania                  0.0000
Namibia                   0.0000
Guinea                    0.0000
Zambia                    0.0000
Papua New Guinea          0.0000
Liberia                   0.0000
Sierra Leone              0.0000
Bahrain                   0.0000
Libya                     0.0000
Togo                      0.0000
Benin                     0.0000
Rwanda                    0.0000
Mozambique                0.0000
Yemen                     0.0000
Somalia                   0.0000
South Sudan               0.0000
Name: count, dtype: float64

In [9]:

            
                Copied!
                
df["surname"].isna().sum()
df["surname"].isna().sum()

Out[9]:

In [8]:

            
                Copied!
                
df["country"].isna().sum()
df["country"].isna().sum()

Out[8]:

In [17]:

            
                Copied!
                
df.to_parquet("data/preprocessed/final_dataset.parquet")
df.to_parquet("data/preprocessed/final_dataset.parquet")