optimise script for names and dob

2026-04-05 03:00:53 -04:00 · 2024-07-11 04:45:25 +05:30
parent 23fb450451
commit dc57ed006a
4 changed files with 55547 additions and 6948 deletions
--- a/common/ofacdata/scripts/ofac.ipynb
+++ b/common/ofacdata/scripts/ofac.ipynb
@@ -9,7 +9,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@@ -152,7 +152,7 @@
       "4      -0-     -0-   -0-       -0-        -0-            -0-   "
      ]
     },
-     "execution_count": 13,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -162,7 +162,7 @@
    "import json\n",
    "import re\n",
    "\n",
-    "file_path = 'sdn.csv'\n",
+    "file_path = '../original/sdn.csv'\n",
    "df1 = pd.read_csv(file_path)\n",
    "print(df1.columns)\n",
    "df1.head()"
@@ -170,7 +170,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -182,7 +182,7 @@
       "      dtype='object')"
      ]
     },
-     "execution_count": 14,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -191,7 +191,7 @@
    "# COUNTRY\n",
    "\n",
    "# read add.csv which is the helper file to add the country column to sdn.csv\n",
-    "add_file_path = 'add.csv'\n",
+    "add_file_path = '../original/add.csv'\n",
    "df2 = pd.read_csv(add_file_path)\n",
    "df2 = df2[['ent_num', 'country']]\n",
    "if 'country' in df1.columns:\n",
@@ -206,7 +206,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -215,7 +215,7 @@
       "Index(['ent_num', 'SDN_name', 'SDN_type', 'Remarks', 'country'], dtype='object')"
      ]
     },
-     "execution_count": 15,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -230,22 +230,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "SDN_type\n",
      "-0-           7252\n",
      "individual    6915\n",
      "vessel         861\n",
      "aircraft       374\n",
-      "Name: count, dtype: int64\n",
-      "Cleaned SDN_type\n",
-      "individual    6915\n",
-      "Name: count, dtype: int64\n"
+      "Name: SDN_type, dtype: int64\n",
+      "Cleaned individual    6915\n",
+      "Name: SDN_type, dtype: int64\n"
     ]
    }
   ],
@@ -265,7 +263,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@@ -274,7 +272,7 @@
       "6915"
      ]
     },
-     "execution_count": 17,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -285,7 +283,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@@ -316,7 +314,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@@ -325,7 +323,7 @@
       "4325"
      ]
     },
-     "execution_count": 19,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -349,7 +347,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@@ -409,7 +407,7 @@
       "2096    -0-   20 jul 1966  20   jul  1966   None  "
      ]
     },
-     "execution_count": 20,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -420,7 +418,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@@ -462,7 +460,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@@ -471,7 +469,7 @@
       "1549"
      ]
     },
-     "execution_count": 22,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -493,7 +491,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -502,7 +500,7 @@
       "11"
      ]
     },
-     "execution_count": 23,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -526,7 +524,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@@ -538,7 +536,7 @@
       "      dtype='object')"
      ]
     },
-     "execution_count": 39,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -549,23 +547,218 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ent_num</th>\n",
+       "      <th>SDN_name</th>\n",
+       "      <th>Remarks</th>\n",
+       "      <th>country</th>\n",
+       "      <th>DOB</th>\n",
+       "      <th>day</th>\n",
+       "      <th>month</th>\n",
+       "      <th>year</th>\n",
+       "      <th>Gender</th>\n",
+       "      <th>Citizen</th>\n",
+       "      <th>Nationality</th>\n",
+       "      <th>Pass_No</th>\n",
+       "      <th>Pass_Country</th>\n",
+       "      <th>Eth_address</th>\n",
+       "      <th>Last_Name</th>\n",
+       "      <th>First_Name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>53</th>\n",
+       "      <td>2674</td>\n",
+       "      <td>ABBAS, Abu</td>\n",
+       "      <td>dob 10 dec 1948; director of palestine liberat...</td>\n",
+       "      <td>-0-</td>\n",
+       "      <td>10 dec 1948</td>\n",
+       "      <td>10</td>\n",
+       "      <td>dec</td>\n",
+       "      <td>1948</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>ABBAS</td>\n",
+       "      <td>ABU</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54</th>\n",
+       "      <td>2675</td>\n",
+       "      <td>AL RAHMAN, Shaykh Umar Abd</td>\n",
+       "      <td>dob 03 may 1938; pob egypt; chief ideological ...</td>\n",
+       "      <td>-0-</td>\n",
+       "      <td>03 may 1938</td>\n",
+       "      <td>03</td>\n",
+       "      <td>may</td>\n",
+       "      <td>1938</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>AL RAHMAN</td>\n",
+       "      <td>SHAYKH UMAR ABD</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>55</th>\n",
+       "      <td>2676</td>\n",
+       "      <td>AL ZAWAHIRI, Dr. Ayman</td>\n",
+       "      <td>dob 19 jun 1951; pob giza, egypt; passport 108...</td>\n",
+       "      <td>-0-</td>\n",
+       "      <td>19 jun 1951</td>\n",
+       "      <td>19</td>\n",
+       "      <td>jun</td>\n",
+       "      <td>1951</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>1084010</td>\n",
+       "      <td>egypt</td>\n",
+       "      <td>None</td>\n",
+       "      <td>AL ZAWAHIRI</td>\n",
+       "      <td>DR. AYMAN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>56</th>\n",
+       "      <td>2677</td>\n",
+       "      <td>AL-ZOMOR, Abboud Abdul Latif Hassan</td>\n",
+       "      <td>dob 19 apr 1947; pob nahia, giza, egypt; natio...</td>\n",
+       "      <td>Egypt</td>\n",
+       "      <td>19 apr 1947</td>\n",
+       "      <td>19</td>\n",
+       "      <td>apr</td>\n",
+       "      <td>1947</td>\n",
+       "      <td>male</td>\n",
+       "      <td>None</td>\n",
+       "      <td>egypt</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>AL-ZOMOR</td>\n",
+       "      <td>ABBOUD ABDUL LATIF HASSAN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>57</th>\n",
+       "      <td>2678</td>\n",
+       "      <td>AWDA, Abd Al Aziz</td>\n",
+       "      <td>dob 1946; chief ideological figure of palestin...</td>\n",
+       "      <td>-0-</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1946</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>AWDA</td>\n",
+       "      <td>ABD AL AZIZ</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ent_num                             SDN_name  \\\n",
+       "53    2674                           ABBAS, Abu   \n",
+       "54    2675           AL RAHMAN, Shaykh Umar Abd   \n",
+       "55    2676               AL ZAWAHIRI, Dr. Ayman   \n",
+       "56    2677  AL-ZOMOR, Abboud Abdul Latif Hassan   \n",
+       "57    2678                    AWDA, Abd Al Aziz   \n",
+       "\n",
+       "                                              Remarks country          DOB  \\\n",
+       "53  dob 10 dec 1948; director of palestine liberat...    -0-   10 dec 1948   \n",
+       "54  dob 03 may 1938; pob egypt; chief ideological ...    -0-   03 may 1938   \n",
+       "55  dob 19 jun 1951; pob giza, egypt; passport 108...    -0-   19 jun 1951   \n",
+       "56  dob 19 apr 1947; pob nahia, giza, egypt; natio...   Egypt  19 apr 1947   \n",
+       "57  dob 1946; chief ideological figure of palestin...    -0-           NaN   \n",
+       "\n",
+       "    day month  year Gender Citizen Nationality  Pass_No Pass_Country  \\\n",
+       "53   10   dec  1948   None    None        None     None         None   \n",
+       "54   03   may  1938   None    None        None     None         None   \n",
+       "55   19   jun  1951   None    None        None  1084010        egypt   \n",
+       "56   19   apr  1947   male    None       egypt     None         None   \n",
+       "57  NaN   NaN  1946   None    None        None     None         None   \n",
+       "\n",
+       "   Eth_address    Last_Name                 First_Name  \n",
+       "53        None        ABBAS                        ABU  \n",
+       "54        None    AL RAHMAN            SHAYKH UMAR ABD  \n",
+       "55        None  AL ZAWAHIRI                  DR. AYMAN  \n",
+       "56        None     AL-ZOMOR  ABBOUD ABDUL LATIF HASSAN  \n",
+       "57        None         AWDA                ABD AL AZIZ  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# the SDN_name column is in the format \"LAST NAME, first name\", so split it into 2 columns at the comma, if the string contains one\n",
+    "result_df[['Last_Name', 'First_Name']] = result_df['SDN_name'].str.split(', ', expand=True, n=1)\n",
+    "result_df['Last_Name'] = result_df['Last_Name'].str.upper()\n",
+    "result_df['First_Name'] = result_df['First_Name'].str.upper()\n",
+    "\n",
+    "result_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_df.to_csv('cleaned_sdn.csv', index=False)\n",
+    "result_df = result_df.where(pd.notnull(result_df), None)\n",
    "\n",
    "filtered_df = result_df.dropna(subset=['Pass_No', 'Pass_Country'], how='all')\n",
    "passport_df = filtered_df[['Pass_No', 'Pass_Country']]\n",
    "filtered_df = result_df.dropna(subset=['Eth_address'], how='all')\n",
    "eth_df = filtered_df[['Eth_address']]\n",
+    "name_dob = result_df[['First_Name', 'Last_Name', 'day', 'month', 'year']]\n",
    "\n",
    "passport_list = passport_df.to_dict(orient='records')\n",
    "with open('passports.json', 'w') as json_file:\n",
    "    json.dump(passport_list, json_file, indent=4)\n",
+    "\n",
    "etherum_list = eth_df.to_dict(orient='records')\n",
    "with open('etherum_add.json','w') as json_file:\n",
-    "    json.dump(etherum_list, json_file, indent=4)\n"
+    "    json.dump(etherum_list, json_file, indent=4)\n",
+    "\n",
+    "name_list = name_dob.to_dict(orient='records')\n",
+    "with open('names.json', 'w') as json_file:\n",
+    "    json.dump(name_list, json_file, indent=4)"
   ]
  }
 ],
@@ -585,7 +778,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.7"
+   "version": "3.9.12"
  }
 },
 "nbformat": 4,