01_preprocessing
In [1]:
import os
import sys

# make local modules importable and run from the repository root,
# so relative paths like "data/raw/..." resolve
sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd
In [104]:
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 800)
In [13]:
from unidecode import unidecode
In [247]:
datasets = os.listdir("data/raw/name_dataset/data")
In [277]:
# Combine the per-country CSVs into one surname/country dataframe.
# Uses the get_country_name helper defined further down in this notebook.
df = pd.DataFrame()
for dataset in datasets:
    if "csv" in dataset:
        df_temp = pd.read_csv(
            "data/raw/name_dataset/data/" + dataset,
            names=["firstname", "surname", "gender", "country_code"],
        )
        df_temp = df_temp.dropna()
        df_temp["country"] = get_country_name(
            df=df_temp, country_name_col="country_code"
        )
        df_temp = df_temp.drop(columns=["firstname", "gender", "country_code"])
        df_temp["surname"] = df_temp["surname"].apply(unidecode)
        df = pd.concat([df, df_temp])
df.to_parquet("data/preprocessed/name_dataset.parquet")
In [278]:
df.to_parquet("data/preprocessed/name_dataset.parquet")
In [155]:
df = pd.read_csv("data/raw/annotated_names_NamePrism.tsv", sep="\t")
In [156]:
df.head()
Out[156]:
| | id | name | ethnicity | country |
|---|---|---|---|---|
| 0 | 1 | Marius Aam | Nordic | Norway |
| 1 | 2 | Aamani | SouthAsian | India |
| 2 | 3 | Abdelkader Aamara | Muslim | Morocco |
| 3 | 4 | Jarle Alex Aambø | Nordic | Norway |
| 4 | 5 | Syed Aamer Ali | Muslim | Pakistan |
In [157]:
len(df)
Out[157]:
460500
In [158]:
df = df.drop(columns=["id", "ethnicity"])
In [159]:
df["name"] = df["name"].apply(unidecode)
In [160]:
# drop rows whose "name" contains titles, honorifics, or sentence fragments
# that indicate the entry is not a plain personal name
words_to_filter = [
    "daughter",
    "Prince",
    "Chief",
    "Reverend",
    " with",
    "admiral",
    "The",
    "Count",
    " from",
    "aka",
    "Prof",
    "Lieutenant",
    "Princess",
    "Windysport",
    " as",
    " is",
    " at",
    " to",
    "nicknamed",
    " name",
    " by",
    " of",
    " and",
    " in",
    "Personal",
    "Honour",
    "postage",
    "Professor",
    " Award",
    "General",
    "Admiral",
    "Born",
    "Website",
    "'",
    r"\)",
    r"\(",
    "Lady",
    "Lord",
]
for word in words_to_filter:
    df = df[~df["name"].str.contains(word)]
# filter acronyms
df = df[~df["name"].str.contains(r"\b[A-Z](?:[&.]?[A-Z])+\b")]
In [161]:
# keep the last whitespace-separated token of each name as the surname
df["surname"] = df["name"].str.split(" ", expand=True).ffill(axis=1).iloc[:, -1]
In [143]:
len(df)
Out[143]:
448898
In [163]:
df.drop(columns=["name"]).to_parquet(
    "data/preprocessed/annotated_names_NamePrism.parquet"
)
In [164]:
df = pd.read_csv("data/raw/surname-nationality.csv")
In [167]:
# .copy() avoids the SettingWithCopyWarning when adding the "country" column below
df = df[["surname", "nationality"]].copy()
In [168]:
df["nationality"].unique()
Out[168]:
array(['Ethiopian', 'Honduran', 'Nigerian', 'Malaysian', 'Chilean', 'Portuguese', 'Papua New Guinean', 'Algerian', 'Brazilian', 'Venezuelan', 'Ukrainian', 'South African', 'Nicaraguan', 'Moroccan', 'Finnish', 'Mexican', 'Palestinian', 'Nepalese', 'Peruvian', 'Dutch', 'Arabic', 'Irish', 'Spanish', 'French', 'German', 'English', 'Korean', 'Indian', 'Vietnamese', 'Scottish', 'Japanese', 'Polish', 'Greek', 'Czech', 'Italian', 'Russian', 'Chinese'], dtype=object)
In [169]:
# map demonyms to country names
df["country"] = df["nationality"].replace(
    {
        "Ethiopian": "Ethiopia",
        "Honduran": "Honduras",
        "Nigerian": "Nigeria",
        "Malaysian": "Malaysia",
        "Chilean": "Chile",
        "Portuguese": "Portugal",
        "Papua New Guinean": "Papua New Guinea",
        "Algerian": "Algeria",
        "Brazilian": "Brazil",
        "Venezuelan": "Venezuela",
        "Ukrainian": "Ukraine",
        "South African": "South Africa",
        "Nicaraguan": "Nicaragua",
        "Moroccan": "Morocco",
        "Finnish": "Finland",
        "Mexican": "Mexico",
        "Palestinian": "Palestine",
        "Nepalese": "Nepal",
        "Peruvian": "Peru",
        "Dutch": "Netherlands",
        "Arabic": "Arabic",
        "Irish": "Ireland",
        "Spanish": "Spain",
        "French": "France",
        "German": "Germany",
        "English": "England",
        "Korean": "Korea",
        "Indian": "India",
        "Vietnamese": "Vietnam",
        "Scottish": "England",
        "Japanese": "Japan",
        "Polish": "Poland",
        "Greek": "Greece",
        "Czech": "Czechia",
        "Italian": "Italy",
        "Russian": "Russia",
        "Chinese": "China",
    }
)
In [170]:
df
Out[170]:
| | surname | nationality | country |
|---|---|---|---|
| 0 | Tesfaye | Ethiopian | Ethiopia |
| 1 | Mohammed | Ethiopian | Ethiopia |
| 2 | Getachew | Ethiopian | Ethiopia |
| 3 | Abebe | Ethiopian | Ethiopia |
| 4 | Girma | Ethiopian | Ethiopia |
| ... | ... | ... | ... |
| 36236 | Yun | Chinese | China |
| 36237 | Zhai | Chinese | China |
| 36238 | Zhi | Chinese | China |
| 36239 | Zhuan | Chinese | China |
| 36240 | Zhui | Chinese | China |

36241 rows × 3 columns
In [171]:
df.drop(columns=["nationality"]).to_parquet(
    "data/preprocessed/surname-nationality.parquet"
)
In [172]:
df = pd.read_csv("data/raw/surnames_with_splits.csv")
In [174]:
# .copy() avoids the SettingWithCopyWarning when adding the "country" column below
df = df[["surname", "nationality"]].copy()
In [175]:
df["nationality"].unique()
Out[175]:
array(['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese'], dtype=object)
In [176]:
# map demonyms to country names
df["country"] = df["nationality"].replace(
    {
        "Arabic": "Arabic",
        "Chinese": "China",
        "Czech": "Czechia",
        "Dutch": "Netherlands",
        "English": "England",
        "French": "France",
        "German": "Germany",
        "Greek": "Greece",
        "Irish": "Ireland",
        "Italian": "Italy",
        "Japanese": "Japan",
        "Korean": "Korea",
        "Polish": "Poland",
        "Portuguese": "Portugal",
        "Russian": "Russia",
        "Scottish": "England",
        "Spanish": "Spain",
        "Vietnamese": "Vietnam",
    }
)
In [177]:
df.drop(columns=["nationality"]).to_parquet(
    "data/preprocessed/surnames_with_splits.parquet"
)
In [235]:
df = pd.read_csv("data/raw/final_all_names_code.csv")

import pycountry


def get_country_name(df: pd.DataFrame, country_name_col: str) -> pd.Series:
    """Map ISO 3166-1 alpha-2 codes in `country_name_col` to country names.

    Args:
        df (pd.DataFrame): input dataframe
        country_name_col (str): column holding the alpha-2 country codes

    Returns:
        pd.Series: country names, with "NA" where a code cannot be resolved
    """
    dfc = df.copy()

    def _get_country_name_map(country_name: str) -> str:
        """Helper function for get_country_name"""
        try:
            result = pycountry.countries.get(alpha_2=country_name)
            name_str = result.name
        except (AttributeError, LookupError):
            # pycountry returns None (or raises) for unknown codes
            name_str = "NA"
        return name_str

    dfc["res"] = "NA"
    for country_name in dfc[country_name_col].unique():
        dfc.loc[dfc[country_name_col] == country_name, ["res"]] = _get_country_name_map(
            country_name
        )
    return dfc["res"]
In [236]:
df
Out[236]:
| | Name | Country_code | Country |
|---|---|---|---|
| 0 | Amy Johnson | ar_AE | AE |
| 1 | Thomas Wright | ar_AE | AE |
| 2 | Mr. Marco Flores DDS | ar_AE | AE |
| 3 | Marcus Robbins | ar_AE | AE |
| 4 | Susan Montgomery | ar_AE | AE |
| ... | ... | ... | ... |
| 404057 | Yang Lu | zh_TW | TW |
| 404058 | Liu Yuxin | zh_TW | TW |
| 404059 | Wu Wei Ting | zh_TW | TW |
| 404060 | Li Shuting | zh_TW | TW |
| 404061 | Sujiarong | zh_TW | TW |

404062 rows × 3 columns
In [237]:
df["country"] = get_country_name(df=df, country_name_col="Country")
In [238]:
len(df[df["country"] == "NA"]), df["country"].unique()
Out[238]:
(0, array(['United Arab Emirates', 'Egypt', 'Jordan', 'Palestine, State of', 'Saudi Arabia', 'Azerbaijan', 'Bulgaria', 'Bangladesh', 'Bosnia and Herzegovina', 'Czechia', 'Denmark', 'Germany', 'Austria', 'Switzerland', 'Cyprus', 'Greece', 'Australia', 'Canada', 'United Kingdom', 'Ireland', 'India', 'New Zealand', 'Philippines', 'Thailand', 'United States', 'Spain', 'Colombia', 'Mexico', 'Estonia', 'Iran, Islamic Republic of', 'Finland', 'France', 'Israel', 'Croatia', 'Hungary', 'Armenia', 'Indonesia', 'Italy', 'Japan', 'Georgia', 'Korea, Republic of', "Lao People's Democratic Republic", 'Luxembourg', 'Lithuania', 'Latvia', 'Malta', 'Nepal', 'Belgium', 'Netherlands', 'Norway', 'Poland', 'Brazil', 'Portugal', 'Romania', 'Russian Federation', 'Slovakia', 'Slovenia', 'Sweden', 'Türkiye', 'Ghana', 'Ukraine', 'China', 'Taiwan, Province of China'], dtype=object))
In [239]:
df = df.drop(columns=["Country", "Country_code"])
In [240]:
df = df.dropna()
In [241]:
# same cleanup as for the NamePrism names: strip whitespace, drop entries that
# contain titles/honorifics or sentence fragments, drop acronyms, and keep the
# last token of each name as the surname
df["Name"] = df["Name"].str.strip()
words_to_filter = [
    "daughter",
    "Prince",
    "Chief",
    "Reverend",
    " with",
    "admiral",
    "The",
    "Count",
    " from",
    "aka",
    "Prof",
    "Lieutenant",
    "Princess",
    "Windysport",
    " as",
    " is",
    " at",
    " to",
    "nicknamed",
    " name",
    " by",
    " of",
    " and",
    " in",
    "Personal",
    "Honour",
    "postage",
    "Professor",
    " Award",
    "General",
    "Admiral",
    "Born",
    "Website",
    "'",
    r"\)",
    r"\(",
    "Lady",
    "Lord",
]
for word in words_to_filter:
    df = df[~df["Name"].str.contains(word)]
# filter acronyms
df = df[~df["Name"].str.contains(r"\b[A-Z](?:[&.]?[A-Z])+\b")]
df["surname"] = df["Name"].str.split(" ", expand=True).ffill(axis=1).iloc[:, -1]
In [243]:
df[df["country"] == "Slovakia"]
Out[243]:
| | Name | country | surname |
|---|---|---|---|
| 345072 | Brooke Griffin | Slovakia | Griffin |
| 345073 | Derrick Gay | Slovakia | Gay |
| 345074 | James Villegas | Slovakia | Villegas |
| 345075 | Johnny Reed | Slovakia | Reed |
| 345076 | Meagan Cooper | Slovakia | Cooper |
| ... | ... | ... | ... |
| 348392 | Crystal Frost | Slovakia | Frost |
| 348393 | Traci Reyes | Slovakia | Reyes |
| 348394 | Nichole Bowen | Slovakia | Bowen |
| 348395 | Sarah Gregory | Slovakia | Gregory |
| 348396 | Cheryl Baker | Slovakia | Baker |

3179 rows × 3 columns
Hmm, the supposedly Slovak entries are just generic English names, so this dataset looks unreliable and is not saved to the preprocessed data.
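One way to back that suspicion up (a purely illustrative check, not in the original notebook) is to count how many of the supposedly Slovak surnames also appear under United States entries:

slovak = set(df.loc[df["country"] == "Slovakia", "surname"])
us = set(df.loc[df["country"] == "United States", "surname"])
len(slovak & us), len(slovak)  # a large overlap suggests the names are not actually Slovak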
In [190]:
df = pd.read_csv("data/raw/name2lang.txt", names=["surname", "nationality"])
In [191]:
df["nationality"].unique()
Out[191]:
array([' Portuguese', ' Irish', ' Spanish', ' Vietnamese', ' Chinese', ' Greek', ' Czech', ' Dutch', ' Japanese', ' French', ' German', ' Scottish', ' English', ' Russian', 'Russian', ' Polish', ' Arabic', ' Korean', ' Italian'], dtype=object)
In [192]:
# map demonyms to country names (most raw values carry a leading space)
df["country"] = df["nationality"].replace(
    {
        " Arabic": "Arabic",
        " Chinese": "China",
        " Czech": "Czechia",
        " Dutch": "Netherlands",
        " English": "England",
        " French": "France",
        " German": "Germany",
        " Greek": "Greece",
        " Irish": "Ireland",
        " Italian": "Italy",
        " Japanese": "Japan",
        " Korean": "Korea",
        " Polish": "Poland",
        " Portuguese": "Portugal",
        "Russian": "Russia",
        " Russian": "Russia",
        " Scottish": "England",
        " Spanish": "Spain",
        " Vietnamese": "Vietnam",
    }
)
In [193]:
df
Out[193]:
| | surname | nationality | country |
|---|---|---|---|
| 0 | Abreu | Portuguese | Portugal |
| 1 | Albuquerque | Portuguese | Portugal |
| 2 | Almeida | Portuguese | Portugal |
| 3 | Alves | Portuguese | Portugal |
| 4 | Araujo | Portuguese | Portugal |
| ... | ... | ... | ... |
| 20045 | Zappa | Italian | Italy |
| 20046 | Zeni | Italian | Italy |
| 20047 | Zini | Italian | Italy |
| 20048 | Zino | Italian | Italy |
| 20049 | Zunino | Italian | Italy |

20050 rows × 3 columns
In [194]:
df.drop(columns=["nationality"]).to_parquet("data/preprocessed/name2lang.parquet")