Files
self/common/ofacdata/scripts/ofac.ipynb
2024-07-21 22:54:49 +05:30

1176 lines
39 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SDN List : Data Processing"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['ent_num', 'SDN_name', 'SDN_type', 'Program', 'Title', 'Call_Sign',\n",
" 'Vess_type', 'Tonnage', 'GRT', 'Vess_flag', 'Vess_owner', 'Remarks'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ent_num</th>\n",
" <th>SDN_name</th>\n",
" <th>SDN_type</th>\n",
" <th>Program</th>\n",
" <th>Title</th>\n",
" <th>Call_Sign</th>\n",
" <th>Vess_type</th>\n",
" <th>Tonnage</th>\n",
" <th>GRT</th>\n",
" <th>Vess_flag</th>\n",
" <th>Vess_owner</th>\n",
" <th>Remarks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>36</td>\n",
" <td>AEROCARIBBEAN AIRLINES</td>\n",
" <td>-0-</td>\n",
" <td>CUBA</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>173</td>\n",
" <td>ANGLO-CARIBBEAN CO., LTD.</td>\n",
" <td>-0-</td>\n",
" <td>CUBA</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>306</td>\n",
" <td>BANCO NACIONAL DE CUBA</td>\n",
" <td>-0-</td>\n",
" <td>CUBA</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>a.k.a. 'BNC'.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>424</td>\n",
" <td>BOUTIQUE LA MAISON</td>\n",
" <td>-0-</td>\n",
" <td>CUBA</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>475</td>\n",
" <td>CASA DE CUBA</td>\n",
" <td>-0-</td>\n",
" <td>CUBA</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ent_num SDN_name SDN_type Program Title Call_Sign \\\n",
"0 36 AEROCARIBBEAN AIRLINES -0- CUBA -0- -0- \n",
"1 173 ANGLO-CARIBBEAN CO., LTD. -0- CUBA -0- -0- \n",
"2 306 BANCO NACIONAL DE CUBA -0- CUBA -0- -0- \n",
"3 424 BOUTIQUE LA MAISON -0- CUBA -0- -0- \n",
"4 475 CASA DE CUBA -0- CUBA -0- -0- \n",
"\n",
" Vess_type Tonnage GRT Vess_flag Vess_owner Remarks \n",
"0 -0- -0- -0- -0- -0- -0- \n",
"1 -0- -0- -0- -0- -0- -0- \n",
"2 -0- -0- -0- -0- -0- a.k.a. 'BNC'. \n",
"3 -0- -0- -0- -0- -0- -0- \n",
"4 -0- -0- -0- -0- -0- -0- "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import json\n",
"import re\n",
"\n",
"file_path = '../original/sdn.csv'\n",
"df1 = pd.read_csv(file_path)\n",
"print(df1.columns)\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ent_num', 'SDN_name', 'SDN_type', 'Program', 'Title', 'Call_Sign',\n",
" 'Vess_type', 'Tonnage', 'GRT', 'Vess_flag', 'Vess_owner', 'Remarks',\n",
" 'country'],\n",
" dtype='object')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# COUNTRY\n",
"\n",
"# read add.csv which is the helper file to add the country column to sdn.csv\n",
"add_file_path = '../original/add.csv'\n",
"df2 = pd.read_csv(add_file_path)\n",
"df2 = df2[['ent_num', 'country']]\n",
"if 'country' in df1.columns:\n",
" df1.drop(columns=['country'], inplace=True)\n",
"\n",
"# add a new column country in sdn.csv and add the values from add.csv when ent_num matches then concatanate the values\n",
"merged_df = pd.merge(df1, df2[['ent_num', 'country']], on='ent_num', how='left')\n",
"grouped_df = merged_df.groupby('ent_num')['country'].apply(lambda x: ';'.join(x.dropna())).reset_index()\n",
"result_df = pd.merge(df1, grouped_df, on='ent_num', how='left')\n",
"result_df.columns\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0- 7252\n",
"individual 6915\n",
"vessel 861\n",
"aircraft 374\n",
"Name: SDN_type, dtype: int64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col = ['Program', 'Title', 'Call_Sign',\n",
" 'Vess_type', 'Tonnage', 'GRT', 'Vess_flag', 'Vess_owner']\n",
"columns_to_drop = [col_name for col_name in col if col_name in result_df.columns]\n",
"result_df.drop(columns=columns_to_drop, inplace=True)\n",
"result_df['SDN_type'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"49\n"
]
}
],
"source": [
"\n",
"# ETH ADDRESSES \n",
"# TODO : Get bitcoin and othe addresses as well \n",
"pattern = r'ETH\\s+(0x[0-9a-fA-F]{40})(?=[\\s;])'\n",
"def extract_eth_addresses(remark):\n",
" if isinstance(remark, str):\n",
" return re.findall(pattern, remark)\n",
" return []\n",
"\n",
"eth_addresses = result_df['Remarks'].apply(extract_eth_addresses).explode().dropna().tolist()\n",
"eth_addresses_dict = [{'Eth_address': addr} for addr in eth_addresses]\n",
"print(len(eth_addresses))\n",
"json_result = json.dumps(eth_addresses_dict, indent=4)\n",
"with open('eth_addresses.json', 'w') as f:\n",
" f.write(json_result)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cleaned individual 6915\n",
"Name: SDN_type, dtype: int64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ent_num</th>\n",
" <th>SDN_name</th>\n",
" <th>Remarks</th>\n",
" <th>country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>2674</td>\n",
" <td>ABBAS, Abu</td>\n",
" <td>DOB 10 Dec 1948; Director of PALESTINE LIBERAT...</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>2675</td>\n",
" <td>AL RAHMAN, Shaykh Umar Abd</td>\n",
" <td>DOB 03 May 1938; POB Egypt; Chief Ideological ...</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>2676</td>\n",
" <td>AL ZAWAHIRI, Dr. Ayman</td>\n",
" <td>DOB 19 Jun 1951; POB Giza, Egypt; Passport 108...</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>2677</td>\n",
" <td>AL-ZOMOR, Abboud Abdul Latif Hassan</td>\n",
" <td>DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio...</td>\n",
" <td>Egypt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>2678</td>\n",
" <td>AWDA, Abd Al Aziz</td>\n",
" <td>DOB 1946; Chief Ideological Figure of PALESTIN...</td>\n",
" <td>-0-</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ent_num SDN_name \\\n",
"53 2674 ABBAS, Abu \n",
"54 2675 AL RAHMAN, Shaykh Umar Abd \n",
"55 2676 AL ZAWAHIRI, Dr. Ayman \n",
"56 2677 AL-ZOMOR, Abboud Abdul Latif Hassan \n",
"57 2678 AWDA, Abd Al Aziz \n",
"\n",
" Remarks country \n",
"53 DOB 10 Dec 1948; Director of PALESTINE LIBERAT... -0- \n",
"54 DOB 03 May 1938; POB Egypt; Chief Ideological ... -0- \n",
"55 DOB 19 Jun 1951; POB Giza, Egypt; Passport 108... -0- \n",
"56 DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio... Egypt \n",
"57 DOB 1946; Chief Ideological Figure of PALESTIN... -0- "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df = result_df[result_df['SDN_type'] == 'individual']\n",
"print(\"Cleaned\",result_df['SDN_type'].value_counts())\n",
"result_df.drop(columns=\"SDN_type\", inplace=True)\n",
"result_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Individual have proper names and some vessels are right name, but most of them are not. Aircrafts seem to be aircraft manufacturers and have codes in names, hence not of any use. -0- seems to be names of company or groups, hence disregarded."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6915"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(result_df) #total individuals"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6855\n",
"5924\n"
]
}
],
"source": [
"# DOB\n",
"\n",
"# for format dd mmm yyyy\n",
"result_df['DOB'] = result_df['Remarks'].str.extract(r'(\\d{2} \\w{3} \\d{4})')\n",
"result_df['day'] = result_df['DOB'].str.extract(r'(\\d{2})')\n",
"result_df['month'] = result_df['DOB'].str.extract(r'(\\w{3})')\n",
"result_df['year'] = result_df['DOB'].str.extract(r'(\\d{4})')\n",
"# for yyyy only format\n",
"result_df['year'] = result_df['Remarks'].str.extract(r'(\\d{4})')\n",
"result_df.head()\n",
"\n",
"print(result_df['year'].count()) # total individuals with at least year in dob\n",
"print(result_df['DOB'].count()) # total individuals with whole dob\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# GENDER\n",
"def extract_gender(text):\n",
" pattern = r'gender (male|female)'\n",
" match = re.search(pattern, text)\n",
" if match:\n",
" return match.group(1)\n",
" else:\n",
" return None\n",
" \n",
"# Apply the function to extract the gender\n",
"result_df[\"Gender\"] = result_df['Remarks'].apply(extract_gender)\n",
"result_df.head()\n",
"result_df[\"Gender\"].count() \n",
"# result_df[result_df['ent_num'] == \"12610\"]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ent_num</th>\n",
" <th>SDN_name</th>\n",
" <th>Remarks</th>\n",
" <th>country</th>\n",
" <th>DOB</th>\n",
" <th>day</th>\n",
" <th>month</th>\n",
" <th>year</th>\n",
" <th>Gender</th>\n",
" <th>Citizen</th>\n",
" <th>Nationality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>2674</td>\n",
" <td>ABBAS, Abu</td>\n",
" <td>DOB 10 Dec 1948; Director of PALESTINE LIBERAT...</td>\n",
" <td>-0-</td>\n",
" <td>10 Dec 1948</td>\n",
" <td>10</td>\n",
" <td>Dec</td>\n",
" <td>1948</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>2675</td>\n",
" <td>AL RAHMAN, Shaykh Umar Abd</td>\n",
" <td>DOB 03 May 1938; POB Egypt; Chief Ideological ...</td>\n",
" <td>-0-</td>\n",
" <td>03 May 1938</td>\n",
" <td>03</td>\n",
" <td>May</td>\n",
" <td>1938</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>2676</td>\n",
" <td>AL ZAWAHIRI, Dr. Ayman</td>\n",
" <td>DOB 19 Jun 1951; POB Giza, Egypt; Passport 108...</td>\n",
" <td>-0-</td>\n",
" <td>19 Jun 1951</td>\n",
" <td>19</td>\n",
" <td>Jun</td>\n",
" <td>1951</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>2677</td>\n",
" <td>AL-ZOMOR, Abboud Abdul Latif Hassan</td>\n",
" <td>DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio...</td>\n",
" <td>Egypt</td>\n",
" <td>19 Apr 1947</td>\n",
" <td>19</td>\n",
" <td>Apr</td>\n",
" <td>1947</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>Egypt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>2678</td>\n",
" <td>AWDA, Abd Al Aziz</td>\n",
" <td>DOB 1946; Chief Ideological Figure of PALESTIN...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1946</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ent_num SDN_name \\\n",
"53 2674 ABBAS, Abu \n",
"54 2675 AL RAHMAN, Shaykh Umar Abd \n",
"55 2676 AL ZAWAHIRI, Dr. Ayman \n",
"56 2677 AL-ZOMOR, Abboud Abdul Latif Hassan \n",
"57 2678 AWDA, Abd Al Aziz \n",
"\n",
" Remarks country DOB \\\n",
"53 DOB 10 Dec 1948; Director of PALESTINE LIBERAT... -0- 10 Dec 1948 \n",
"54 DOB 03 May 1938; POB Egypt; Chief Ideological ... -0- 03 May 1938 \n",
"55 DOB 19 Jun 1951; POB Giza, Egypt; Passport 108... -0- 19 Jun 1951 \n",
"56 DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio... Egypt 19 Apr 1947 \n",
"57 DOB 1946; Chief Ideological Figure of PALESTIN... -0- NaN \n",
"\n",
" day month year Gender Citizen Nationality \n",
"53 10 Dec 1948 None None None \n",
"54 03 May 1938 None None None \n",
"55 19 Jun 1951 None None None \n",
"56 19 Apr 1947 None None Egypt \n",
"57 NaN NaN 1946 None None None "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_nationality(remark):\n",
" pattern = r'nationality ([A-Za-z]+);'\n",
" match = re.search(pattern, remark)\n",
" if match:\n",
" return match.group(1)\n",
" else:\n",
" return None\n",
"\n",
"def extract_citizen(remark):\n",
" pattern = r'citizen ([A-Za-z]+);'\n",
" match = re.search(pattern, remark)\n",
" if match:\n",
" return match.group(1)\n",
" else:\n",
" return None\n",
"\n",
"\n",
"# Apply the extract_nationality function to the 'remarks' column\n",
"result_df['Citizen'] = result_df['Remarks'].apply(extract_citizen)\n",
"result_df['Nationality'] = result_df['Remarks'].apply(extract_nationality)\n",
"result_df['Nationality'].count()\n",
"\n",
"filtered_df = result_df.dropna(subset=['Citizen', 'Nationality'])\n",
"diff_values_df = filtered_df[filtered_df['Citizen'] != filtered_df['Nationality']]\n",
"count_diff_values = diff_values_df.shape[0]\n",
"print(count_diff_values) # 20 instances where in remark both citizen <country1> and nationality <country2> are mentioned, hence seperated\n",
"result_df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1549"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df['Remarks'] = result_df['Remarks'].str.lower()\n",
"pattern = r'passport ([^\\(]+) \\(([^)]+)\\)'\n",
"\n",
"def extract_passport_info(remark):\n",
" match = re.search(pattern, remark)\n",
" if match:\n",
" return match.group(1), match.group(2)\n",
" else:\n",
" return None, None\n",
"\n",
"\n",
"result_df[['Pass_No', 'Pass_Country']] = result_df['Remarks'].apply(lambda x: pd.Series(extract_passport_info(x)))\n",
"result_df['Pass_No'].count() # total individuals with passport number"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ent_num</th>\n",
" <th>SDN_name</th>\n",
" <th>Remarks</th>\n",
" <th>country</th>\n",
" <th>DOB</th>\n",
" <th>day</th>\n",
" <th>month</th>\n",
" <th>year</th>\n",
" <th>Gender</th>\n",
" <th>Citizen</th>\n",
" <th>Nationality</th>\n",
" <th>Pass_No</th>\n",
" <th>Pass_Country</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>2674</td>\n",
" <td>ABBAS, Abu</td>\n",
" <td>dob 10 dec 1948; director of palestine liberat...</td>\n",
" <td>-0-</td>\n",
" <td>10 Dec 1948</td>\n",
" <td>10</td>\n",
" <td>Dec</td>\n",
" <td>1948</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>2675</td>\n",
" <td>AL RAHMAN, Shaykh Umar Abd</td>\n",
" <td>dob 03 may 1938; pob egypt; chief ideological ...</td>\n",
" <td>-0-</td>\n",
" <td>03 May 1938</td>\n",
" <td>03</td>\n",
" <td>May</td>\n",
" <td>1938</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>2676</td>\n",
" <td>AL ZAWAHIRI, Dr. Ayman</td>\n",
" <td>dob 19 jun 1951; pob giza, egypt; passport 108...</td>\n",
" <td>-0-</td>\n",
" <td>19 Jun 1951</td>\n",
" <td>19</td>\n",
" <td>Jun</td>\n",
" <td>1951</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1084010</td>\n",
" <td>egypt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>2677</td>\n",
" <td>AL-ZOMOR, Abboud Abdul Latif Hassan</td>\n",
" <td>dob 19 apr 1947; pob nahia, giza, egypt; natio...</td>\n",
" <td>Egypt</td>\n",
" <td>19 Apr 1947</td>\n",
" <td>19</td>\n",
" <td>Apr</td>\n",
" <td>1947</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>Egypt</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>2678</td>\n",
" <td>AWDA, Abd Al Aziz</td>\n",
" <td>dob 1946; chief ideological figure of palestin...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1946</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>2679</td>\n",
" <td>FADLALLAH, Shaykh Muhammad Husayn</td>\n",
" <td>dob 1938; alt. dob 1936; pob najf al ashraf (n...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1938</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>2681</td>\n",
" <td>HAWATMA, Nayif</td>\n",
" <td>dob 1933; secretary general of democratic fron...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1933</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>2682</td>\n",
" <td>ISLAMBOULI, Mohammad Shawqi</td>\n",
" <td>dob 15 jan 1955; pob egypt; passport 304555 (e...</td>\n",
" <td>-0-</td>\n",
" <td>15 Jan 1955</td>\n",
" <td>15</td>\n",
" <td>Jan</td>\n",
" <td>1955</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>304555</td>\n",
" <td>egypt</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>2683</td>\n",
" <td>JABRIL, Ahmad</td>\n",
" <td>dob 1938; pob ramleh, israel; secretary genera...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1938</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>2685</td>\n",
" <td>NAJI, Talal Muhammad Rashid</td>\n",
" <td>dob 1930; pob al nasiria, palestine; principal...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1930</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ent_num SDN_name \\\n",
"53 2674 ABBAS, Abu \n",
"54 2675 AL RAHMAN, Shaykh Umar Abd \n",
"55 2676 AL ZAWAHIRI, Dr. Ayman \n",
"56 2677 AL-ZOMOR, Abboud Abdul Latif Hassan \n",
"57 2678 AWDA, Abd Al Aziz \n",
"58 2679 FADLALLAH, Shaykh Muhammad Husayn \n",
"59 2681 HAWATMA, Nayif \n",
"60 2682 ISLAMBOULI, Mohammad Shawqi \n",
"61 2683 JABRIL, Ahmad \n",
"62 2685 NAJI, Talal Muhammad Rashid \n",
"\n",
" Remarks country DOB \\\n",
"53 dob 10 dec 1948; director of palestine liberat... -0- 10 Dec 1948 \n",
"54 dob 03 may 1938; pob egypt; chief ideological ... -0- 03 May 1938 \n",
"55 dob 19 jun 1951; pob giza, egypt; passport 108... -0- 19 Jun 1951 \n",
"56 dob 19 apr 1947; pob nahia, giza, egypt; natio... Egypt 19 Apr 1947 \n",
"57 dob 1946; chief ideological figure of palestin... -0- NaN \n",
"58 dob 1938; alt. dob 1936; pob najf al ashraf (n... -0- NaN \n",
"59 dob 1933; secretary general of democratic fron... -0- NaN \n",
"60 dob 15 jan 1955; pob egypt; passport 304555 (e... -0- 15 Jan 1955 \n",
"61 dob 1938; pob ramleh, israel; secretary genera... -0- NaN \n",
"62 dob 1930; pob al nasiria, palestine; principal... -0- NaN \n",
"\n",
" day month year Gender Citizen Nationality Pass_No Pass_Country \n",
"53 10 Dec 1948 None None None None None \n",
"54 03 May 1938 None None None None None \n",
"55 19 Jun 1951 None None None 1084010 egypt \n",
"56 19 Apr 1947 None None Egypt None None \n",
"57 NaN NaN 1946 None None None None None \n",
"58 NaN NaN 1938 None None None None None \n",
"59 NaN NaN 1933 None None None None None \n",
"60 15 Jan 1955 None None None 304555 egypt \n",
"61 NaN NaN 1938 None None None None None \n",
"62 NaN NaN 1930 None None None None None "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ent_num', 'SDN_name', 'Remarks', 'country', 'DOB', 'day', 'month',\n",
" 'year', 'Gender', 'Citizen', 'Nationality', 'Pass_No', 'Pass_Country'],\n",
" dtype='object')"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ent_num</th>\n",
" <th>SDN_name</th>\n",
" <th>Remarks</th>\n",
" <th>country</th>\n",
" <th>DOB</th>\n",
" <th>day</th>\n",
" <th>month</th>\n",
" <th>year</th>\n",
" <th>Gender</th>\n",
" <th>Citizen</th>\n",
" <th>Nationality</th>\n",
" <th>Pass_No</th>\n",
" <th>Pass_Country</th>\n",
" <th>Last_Name</th>\n",
" <th>First_Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>2674</td>\n",
" <td>ABBAS, Abu</td>\n",
" <td>dob 10 dec 1948; director of palestine liberat...</td>\n",
" <td>-0-</td>\n",
" <td>10 Dec 1948</td>\n",
" <td>10</td>\n",
" <td>Dec</td>\n",
" <td>1948</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>ABBAS</td>\n",
" <td>ABU</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>2675</td>\n",
" <td>AL RAHMAN, Shaykh Umar Abd</td>\n",
" <td>dob 03 may 1938; pob egypt; chief ideological ...</td>\n",
" <td>-0-</td>\n",
" <td>03 May 1938</td>\n",
" <td>03</td>\n",
" <td>May</td>\n",
" <td>1938</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>AL RAHMAN</td>\n",
" <td>SHAYKH UMAR ABD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>2676</td>\n",
" <td>AL ZAWAHIRI, Dr. Ayman</td>\n",
" <td>dob 19 jun 1951; pob giza, egypt; passport 108...</td>\n",
" <td>-0-</td>\n",
" <td>19 Jun 1951</td>\n",
" <td>19</td>\n",
" <td>Jun</td>\n",
" <td>1951</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1084010</td>\n",
" <td>egypt</td>\n",
" <td>AL ZAWAHIRI</td>\n",
" <td>DR. AYMAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>2677</td>\n",
" <td>AL-ZOMOR, Abboud Abdul Latif Hassan</td>\n",
" <td>dob 19 apr 1947; pob nahia, giza, egypt; natio...</td>\n",
" <td>Egypt</td>\n",
" <td>19 Apr 1947</td>\n",
" <td>19</td>\n",
" <td>Apr</td>\n",
" <td>1947</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>Egypt</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>AL-ZOMOR</td>\n",
" <td>ABBOUD ABDUL LATIF HASSAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>2678</td>\n",
" <td>AWDA, Abd Al Aziz</td>\n",
" <td>dob 1946; chief ideological figure of palestin...</td>\n",
" <td>-0-</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1946</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>AWDA</td>\n",
" <td>ABD AL AZIZ</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ent_num SDN_name \\\n",
"53 2674 ABBAS, Abu \n",
"54 2675 AL RAHMAN, Shaykh Umar Abd \n",
"55 2676 AL ZAWAHIRI, Dr. Ayman \n",
"56 2677 AL-ZOMOR, Abboud Abdul Latif Hassan \n",
"57 2678 AWDA, Abd Al Aziz \n",
"\n",
" Remarks country DOB \\\n",
"53 dob 10 dec 1948; director of palestine liberat... -0- 10 Dec 1948 \n",
"54 dob 03 may 1938; pob egypt; chief ideological ... -0- 03 May 1938 \n",
"55 dob 19 jun 1951; pob giza, egypt; passport 108... -0- 19 Jun 1951 \n",
"56 dob 19 apr 1947; pob nahia, giza, egypt; natio... Egypt 19 Apr 1947 \n",
"57 dob 1946; chief ideological figure of palestin... -0- NaN \n",
"\n",
" day month year Gender Citizen Nationality Pass_No Pass_Country \\\n",
"53 10 Dec 1948 None None None None None \n",
"54 03 May 1938 None None None None None \n",
"55 19 Jun 1951 None None None 1084010 egypt \n",
"56 19 Apr 1947 None None Egypt None None \n",
"57 NaN NaN 1946 None None None None None \n",
"\n",
" Last_Name First_Name \n",
"53 ABBAS ABU \n",
"54 AL RAHMAN SHAYKH UMAR ABD \n",
"55 AL ZAWAHIRI DR. AYMAN \n",
"56 AL-ZOMOR ABBOUD ABDUL LATIF HASSAN \n",
"57 AWDA ABD AL AZIZ "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the sdn_name column has in the format \"LAST NAME, first name\", so split it up into 2 columns if you find the comma in the string\n",
"result_df[['Last_Name', 'First_Name']] = result_df['SDN_name'].str.split(', ', expand=True, n=1)\n",
"result_df['Last_Name'] = result_df['Last_Name'].str.upper()\n",
"result_df['First_Name'] = result_df['First_Name'].str.upper()\n",
"result_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"result_df.to_csv('cleaned_sdn.csv', index=False)\n",
"result_df = result_df.where(pd.notnull(result_df), None)\n",
"\n",
"filtered_df = result_df.dropna(subset=['Pass_No', 'Pass_Country'], how='all')\n",
"passport_df = filtered_df[['Pass_No', 'Pass_Country']]\n",
"name_dob = result_df[['First_Name', 'Last_Name', 'day', 'month', 'year']]\n",
"\n",
"passport_list = passport_df.to_dict(orient='records')\n",
"with open('passports.json', 'w') as json_file:\n",
" json.dump(passport_list, json_file, indent=4)\n",
" \n",
"name_list = name_dob.to_dict(orient='records')\n",
"with open('names.json', 'w') as json_file:\n",
" json.dump(name_list, json_file, indent=4)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}