optimise script for names and dob

This commit is contained in:
ashpect
2024-07-11 04:45:25 +05:30
parent 23fb450451
commit dc57ed006a
4 changed files with 55547 additions and 6948 deletions

View File

@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -152,7 +152,7 @@
"4 -0- -0- -0- -0- -0- -0- "
]
},
"execution_count": 13,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -162,7 +162,7 @@
"import json\n",
"import re\n",
"\n",
"file_path = 'sdn.csv'\n",
"file_path = '../original/sdn.csv'\n",
"df1 = pd.read_csv(file_path)\n",
"print(df1.columns)\n",
"df1.head()"
@@ -170,7 +170,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -182,7 +182,7 @@
" dtype='object')"
]
},
"execution_count": 14,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -191,7 +191,7 @@
"# COUNTRY\n",
"\n",
"# read add.csv which is the helper file to add the country column to sdn.csv\n",
"add_file_path = 'add.csv'\n",
"add_file_path = '../original/add.csv'\n",
"df2 = pd.read_csv(add_file_path)\n",
"df2 = df2[['ent_num', 'country']]\n",
"if 'country' in df1.columns:\n",
@@ -206,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -215,7 +215,7 @@
"Index(['ent_num', 'SDN_name', 'SDN_type', 'Remarks', 'country'], dtype='object')"
]
},
"execution_count": 15,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -230,22 +230,20 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SDN_type\n",
"-0- 7252\n",
"individual 6915\n",
"vessel 861\n",
"aircraft 374\n",
"Name: count, dtype: int64\n",
"Cleaned SDN_type\n",
"individual 6915\n",
"Name: count, dtype: int64\n"
"Name: SDN_type, dtype: int64\n",
"Cleaned individual 6915\n",
"Name: SDN_type, dtype: int64\n"
]
}
],
@@ -265,7 +263,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -274,7 +272,7 @@
"6915"
]
},
"execution_count": 17,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -285,7 +283,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -316,7 +314,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -325,7 +323,7 @@
"4325"
]
},
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -349,7 +347,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -409,7 +407,7 @@
"2096 -0- 20 jul 1966 20 jul 1966 None "
]
},
"execution_count": 20,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -420,7 +418,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -462,7 +460,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -471,7 +469,7 @@
"1549"
]
},
"execution_count": 22,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -493,7 +491,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -502,7 +500,7 @@
"11"
]
},
"execution_count": 23,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -526,7 +524,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -538,7 +536,7 @@
" dtype='object')"
]
},
"execution_count": 39,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -549,23 +547,218 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ent_num</th>\n",
" <th>SDN_name</th>\n",
" <th>Remarks</th>\n",
" <th>country</th>\n",
" <th>DOB</th>\n",
" <th>day</th>\n",
" <th>month</th>\n",
" <th>year</th>\n",
" <th>Gender</th>\n",
" <th>Citizen</th>\n",
" <th>Nationality</th>\n",
" <th>Pass_No</th>\n",
" <th>Pass_Country</th>\n",
" <th>Eth_address</th>\n",
" <th>Last_Name</th>\n",
" <th>First_Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>2674</td>\n",
" <td>ABBAS, Abu</td>\n",
" <td>dob 10 dec 1948; director of palestine liberat...</td>\n",
" <td>-0-</td>\n",
" <td>10 dec 1948</td>\n",
" <td>10</td>\n",
" <td>dec</td>\n",
" <td>1948</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>ABBAS</td>\n",
" <td>ABU</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>2675</td>\n",
" <td>AL RAHMAN, Shaykh Umar Abd</td>\n",
" <td>dob 03 may 1938; pob egypt; chief ideological ...</td>\n",
" <td>-0-</td>\n",
" <td>03 may 1938</td>\n",
" <td>03</td>\n",
" <td>may</td>\n",
" <td>1938</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>AL RAHMAN</td>\n",
" <td>SHAYKH UMAR ABD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>2676</td>\n",
" <td>AL ZAWAHIRI, Dr. Ayman</td>\n",
" <td>dob 19 jun 1951; pob giza, egypt; passport 108...</td>\n",
" <td>-0-</td>\n",
" <td>19 jun 1951</td>\n",
" <td>19</td>\n",
" <td>jun</td>\n",
" <td>1951</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1084010</td>\n",
" <td>egypt</td>\n",
" <td>None</td>\n",
" <td>AL ZAWAHIRI</td>\n",
" <td>DR. AYMAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>2677</td>\n",
" <td>AL-ZOMOR, Abboud Abdul Latif Hassan</td>\n",
" <td>dob 19 apr 1947; pob nahia, giza, egypt; natio...</td>\n",
" <td>Egypt</td>\n",
" <td>19 apr 1947</td>\n",
" <td>19</td>\n",
" <td>apr</td>\n",
" <td>1947</td>\n",
" <td>male</td>\n",
" <td>None</td>\n",
" <td>egypt</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>AL-ZOMOR</td>\n",
" <td>ABBOUD ABDUL LATIF HASSAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>2678</td>\n",
" <td>AWDA, Abd Al Aziz</td>\n",
" <td>dob 1946; chief ideological figure of palestin...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1946</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>AWDA</td>\n",
" <td>ABD AL AZIZ</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ent_num SDN_name \\\n",
"53 2674 ABBAS, Abu \n",
"54 2675 AL RAHMAN, Shaykh Umar Abd \n",
"55 2676 AL ZAWAHIRI, Dr. Ayman \n",
"56 2677 AL-ZOMOR, Abboud Abdul Latif Hassan \n",
"57 2678 AWDA, Abd Al Aziz \n",
"\n",
" Remarks country DOB \\\n",
"53 dob 10 dec 1948; director of palestine liberat... -0- 10 dec 1948 \n",
"54 dob 03 may 1938; pob egypt; chief ideological ... -0- 03 may 1938 \n",
"55 dob 19 jun 1951; pob giza, egypt; passport 108... -0- 19 jun 1951 \n",
"56 dob 19 apr 1947; pob nahia, giza, egypt; natio... Egypt 19 apr 1947 \n",
"57 dob 1946; chief ideological figure of palestin... -0- NaN \n",
"\n",
" day month year Gender Citizen Nationality Pass_No Pass_Country \\\n",
"53 10 dec 1948 None None None None None \n",
"54 03 may 1938 None None None None None \n",
"55 19 jun 1951 None None None 1084010 egypt \n",
"56 19 apr 1947 male None egypt None None \n",
"57 NaN NaN 1946 None None None None None \n",
"\n",
" Eth_address Last_Name First_Name \n",
"53 None ABBAS ABU \n",
"54 None AL RAHMAN SHAYKH UMAR ABD \n",
"55 None AL ZAWAHIRI DR. AYMAN \n",
"56 None AL-ZOMOR ABBOUD ABDUL LATIF HASSAN \n",
"57 None AWDA ABD AL AZIZ "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the SDN_name column is in the format \"LAST NAME, first name\"; split it on the first \", \" into Last_Name and First_Name\n",
"result_df[['Last_Name', 'First_Name']] = result_df['SDN_name'].str.split(', ', expand=True, n=1)\n",
"result_df['Last_Name'] = result_df['Last_Name'].str.upper()\n",
"result_df['First_Name'] = result_df['First_Name'].str.upper()\n",
"\n",
"result_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"result_df.to_csv('cleaned_sdn.csv', index=False)\n",
"result_df = result_df.where(pd.notnull(result_df), None)\n",
"\n",
"filtered_df = result_df.dropna(subset=['Pass_No', 'Pass_Country'], how='all')\n",
"passport_df = filtered_df[['Pass_No', 'Pass_Country']]\n",
"filtered_df = result_df.dropna(subset=['Eth_address'], how='all')\n",
"eth_df = filtered_df[['Eth_address']]\n",
"name_dob = result_df[['First_Name', 'Last_Name', 'day', 'month', 'year']]\n",
"\n",
"passport_list = passport_df.to_dict(orient='records')\n",
"with open('passports.json', 'w') as json_file:\n",
" json.dump(passport_list, json_file, indent=4)\n",
"\n",
"etherum_list = eth_df.to_dict(orient='records')\n",
"with open('etherum_add.json','w') as json_file:\n",
" json.dump(etherum_list, json_file, indent=4)\n"
" json.dump(etherum_list, json_file, indent=4)\n",
"\n",
"name_list = name_dob.to_dict(orient='records')\n",
"with open('names.json', 'w') as json_file:\n",
" json.dump(name_list, json_file, indent=4)"
]
}
],
@@ -585,7 +778,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
"version": "3.9.12"
}
},
"nbformat": 4,