- added comments

- fixed bug in undersampling rename
- improved error checking
- drops undersampled columns which provide no value to ML
ACK-J
2022-07-08 12:54:47 -04:00
parent 8a412c500a
commit b455246a1d
2 changed files with 46 additions and 20 deletions

collect.sh

@@ -2,7 +2,7 @@
# Requirements: jq, parallel, xmr2csv, expect, all monero binaries
# Before running this script first compile and run
# "./monerod --testnet" https://github.com/monero-project/monero#compiling-monero-from-source
# "./monerod --stagenet" https://github.com/monero-project/monero#compiling-monero-from-source
# Before running this script first compile xmr2csv
# also add it to your path https://github.com/moneroexamples/transactions-export
# Usage: ./collect.sh
@@ -12,7 +12,7 @@
# Global variables
NODE_ADDRESS="127.0.0.1" # testnet.community.rino.io | stagenet.community.rino.io | node.community.rino.io
NETWORK="testnet" # Case-sensitive (make all lowercase) (Options: "testnet", "stagenet", or "mainnet")
NETWORK="stagenet" # Case-sensitive (make all lowercase) (Options: "testnet", "stagenet", or "mainnet")
num_processors=$(nproc --all)
# TODO LOOK INTO ADDING --ALL-OUTPUTS AND --ALL-KEY-IMAGES
@@ -44,7 +44,7 @@ while read dir; do # Read in all directories that contain a .keys file in the current directory
cd "$dir" || exit
echo "$dir"
working_dir=$(pwd)
-while read walletAddrFile; do # Loop each .txt wallet addr file
+while read walletAddrFile; do # Loop each .address.txt wallet addr file
# Gets the name of the current wallet file
walletName=$(echo $walletAddrFile | cut -f 2 -d "." | cut -f 2 -d "/")
# Gets the address of the current wallet
@@ -157,7 +157,7 @@ EOL
fi # End error check
done < <(find ./ -type f -name "*.txt" | sort -u) # Find text files in each wallet directory
done < <(find ./ -type f -name "*.address.txt" | sort -u) # Find text files in each wallet directory
cd - || exit
# Find wallet directories that contain a .keys file and only get the parent dirs
done < <(find . -mindepth 2 -type f -name '*.keys' | sed -r 's|/[^/]+$||' | sort -u )

create_dataset.py

@@ -23,7 +23,7 @@ options.mode.chained_assignment = None # default='warn'
'''
Description:
Usage: ./create_dataset.py < Wallets Directory Path >
-Date: 7/6/2022
+Date: 7/8/2022
Author: ACK-J
Warning: DO NOT run this with a remote node, there are a lot of blockchain lookups and it will be slow!
@@ -71,11 +71,6 @@ def get_xmr_tx(tx_hash):
def enrich_data(tx_dict_item):
"""
:param tx_dict_item:
:return:
"""
tx_hash = tx_dict_item[0]
transaction_entry = tx_dict_item[1]
@@ -178,23 +173,34 @@ ORDER BY input_pos, input_mem_idx, height_B ASC
previous_input_mem = None
num_decoys_found = 0
# Iterate over the list of results
for result_idx, occurrence in enumerate(results):
num_decoys_found += 1
# unzip the list into variables
Block_A, Block_B, Input_Pos, Input_Member_Idx = occurrence
# Input position and member index start at 1 instead of 0. To keep
# things consistent, subtract one from each.
Input_Pos -= 1
Input_Member_Idx -= 1
# If this is the first time running
if previous_input_mem is None:
# Set the input pos and member index
previous_input_mem = (Input_Pos, Input_Member_Idx)
# Check if the current input pos and member index is different from the previous
elif previous_input_mem != (Input_Pos, Input_Member_Idx):
# Get the length of the previous amount of on chain decoys found
transaction_entry['Inputs'][previous_input_mem[0]]['Decoys'][previous_input_mem[1]]['Number_Of_On_Chain_Decoys'] = len(transaction_entry['Inputs'][previous_input_mem[0]]['Decoys'][previous_input_mem[1]]['On_Chain_Decoy_Block_Deltas'])
-assert transaction_entry['Inputs'][previous_input_mem[0]]['Decoys'][previous_input_mem[1]]['On_Chain_Decoy_Block_Deltas']["0_" + str(num_decoys_found-1)] == transaction_entry['Block_Number'] - transaction_entry['Inputs'][previous_input_mem[0]]['Ring_Members'][previous_input_mem[1]]['block_no']
+#assert transaction_entry['Inputs'][previous_input_mem[0]]['Decoys'][previous_input_mem[1]]['On_Chain_Decoy_Block_Deltas']["0_" + str(num_decoys_found-1)] == transaction_entry['Block_Number'] - transaction_entry['Inputs'][previous_input_mem[0]]['Ring_Members'][previous_input_mem[1]]['block_no']
previous_input_mem = (Input_Pos, Input_Member_Idx)
num_decoys_found = 1
# Edge case where the last result would get skipped
elif result_idx+1 == len(results):
transaction_entry['Inputs'][previous_input_mem[0]]['Decoys'][previous_input_mem[1]]['Number_Of_On_Chain_Decoys'] = len(transaction_entry['Inputs'][previous_input_mem[0]]['Decoys'][previous_input_mem[1]]['On_Chain_Decoy_Block_Deltas'])
# If the input key does not exist in the dictionary, add it
if Input_Member_Idx not in transaction_entry['Inputs'][Input_Pos]['Decoys'].keys():
transaction_entry['Inputs'][Input_Pos]['Decoys'][Input_Member_Idx] = {}
transaction_entry['Inputs'][Input_Pos]['Decoys'][Input_Member_Idx]['On_Chain_Decoy_Block_Deltas'] = {}
# Calculate the block difference between the first occurrence and the decoy found on chain
transaction_entry['Inputs'][Input_Pos]['Decoys'][Input_Member_Idx]['On_Chain_Decoy_Block_Deltas']["0_" + str(num_decoys_found)] = Block_B - Block_A
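For orientation, once this loop has processed a ring member, the nested structure it fills in would look roughly like the sketch below; the key names mirror the code above, but every value is invented:

    # Sketch of the nested structure built above; keys follow the code in
    # this hunk, values are invented for illustration only.
    transaction_entry = {
        'Block_Number': 2520000,
        'Inputs': [{                                  # one entry per ring signature input
            'Ring_Members': [{'block_no': 2519800}, {'block_no': 2519950}],
            'Decoys': {                               # keyed by zero-based ring member index
                1: {
                    # "0_N" -> height delta of the N-th decoy occurrence found on chain
                    'On_Chain_Decoy_Block_Deltas': {'0_1': 42, '0_2': 117},
                    'Number_Of_On_Chain_Decoys': 2,
                },
            },
        }],
    }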
@@ -249,8 +255,6 @@ def combine_files(Wallet_info):
"""
Wallet_addr = Wallet_info[0]
Wallet_dir = Wallet_info[1]
# CSV HEADER -> "block, direction, unlocked, timestamp, amount, running balance, hash, payment ID, fee, destination, amount, index, note"
# 0 1 2 3 4 5 6 7 8 9 10 11 12
wallet_tx_data = {}
empty_wallet_warning = 0
@@ -261,6 +265,8 @@ def combine_files(Wallet_info):
# If the file only has 1 line then it's just the csv header and the wallet had no transactions
if len(f.readlines()) > 1:
# If there are transactions, open the file and start parsing
# CSV HEADER -> "block, direction, unlocked, timestamp, amount, running balance, hash, payment ID, fee, destination, amount, index, note"
# 0 1 2 3 4 5 6 7 8 9 10 11 12
with open(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv", "r") as fp:
next(fp) # Skip header of csv
for line in fp:
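Read against the header comment above, a single exported row splits into those column positions like this; the row itself is fabricated, not real export data:

    # Fabricated example row, split per the header comment above.
    row = "2404956,in,unlocked,1657299287,2.5,2.5,9f8e7d...,N/A,0.0001,-,2.5,0,-"
    fields = row.split(",")
    block, direction, tx_hash = fields[0], fields[1], fields[6]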
@@ -286,7 +292,6 @@ def combine_files(Wallet_info):
transaction['Outputs'] = {}
transaction['Outputs']['Output_Data'] = list()
-#transaction['Outputs']['Decoys_On_Chain'] = []
transaction['Inputs'] = []
# Add the time that xmr2csv was run
@@ -397,7 +402,7 @@ def discover_wallet_directories(dir_to_search):
for name in files: # Get the file name
# Get each csv file
if name.lower().endswith(".csv"):
-# Extract the 2 unique wallet addr from the name of the files
+# Extract the 2 unique wallet addresses from the name of the files
addr = name[::-1].split(".")[1].split("_")[0][::-1]
if addr not in Wallet_addrs:
Wallet_info.append([addr, dir])
@@ -560,10 +565,10 @@ def undersample_processing(y, series, min_occurrences, occurrences, predicting):
occurrences[ring_pos] = occurrences[ring_pos] + 1
# https://stackoverflow.com/questions/57392878/how-to-speed-up-pandas-drop-method
# Check if the column name has data relating to irrelevant ring signatures and Delete the columns
-temp_series.drop([column for column in temp_series.index if "Inputs." in column and "." + str(ring_array_idx) + "." not in column], inplace=True)
+temp_series.drop([column for column in temp_series.index if column.startswith("Inputs.") and not column.startswith("Inputs." + str(ring_array_idx) + ".")], inplace=True)
# Check if the column name is for the current ring signature
# Rename the column such that it doesn't have the .0. or .1. positioning information
-temp_series.rename({column: column.replace("Inputs." + str(ring_array_idx) + ".", "Input.") for column in temp_series.index if "Inputs." + str(ring_array_idx) + "." in column}, inplace=True)
+temp_series.rename({column: column.replace("Inputs." + str(ring_array_idx) + ".", "Input.", 1) for column in temp_series.index if column.startswith("Inputs." + str(ring_array_idx) + ".")}, inplace=True)
# Add to the new X and y dataframes
new_X.append(temp_series)
new_y.append(ring_pos)
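The old substring tests could match a ring index at the wrong nesting depth, for example a per-decoy ".1." inside an "Inputs.0." column, which is presumably the rename/drop bug the commit message refers to. A minimal sketch with invented column names, assuming pandas:

    import pandas as pd

    # Invented flattened column names; only the names matter here.
    s = pd.Series(0, index=[
        "Inputs.1.Amount",          # ring 1 -> keep, rename to Input.Amount
        "Inputs.0.Amount",          # ring 0 -> should be dropped
        "Inputs.0.Decoys.1.Delta",  # ring 0, but contains ".1."
        "Tx_Size",                  # unrelated column -> keep
    ])
    ring_array_idx = 1

    # Old filter: the bare substring test lets the ring-0 decoy column survive.
    old_drop = [c for c in s.index
                if "Inputs." in c and "." + str(ring_array_idx) + "." not in c]
    assert old_drop == ["Inputs.0.Amount"]

    # Fixed filter: anchoring the match to the start of the name catches it.
    new_drop = [c for c in s.index
                if c.startswith("Inputs.") and not c.startswith("Inputs.1.")]
    assert new_drop == ["Inputs.0.Amount", "Inputs.0.Decoys.1.Delta"]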
@@ -873,7 +878,16 @@ def validate_data_integrity(X, y, undersampled=False):
del original_data # Free up RAM
+def delete_file(list_of_paths):
+    for path in list_of_paths:
+        if exists(path):
+            remove(path)
def main():
+    ############################
+    #      Error Checking      #
+    ############################
# Error Checking for command line args
if len(argv) != 2:
print("Usage Error: ./create_dataset.py < Wallets Directory Path >")
@@ -883,20 +897,30 @@ def main():
except ConnectionError as e:
print("Error: " + red + NETWORK + reset + " block explorer located at " + API_URL + " refused connection!")
exit(1)
+    # Check if the user set up the block explorer correctly
+    try:
+        get(API_URL + "/networkinfo")  # For some reason the first request fails sometimes but the second request doesn't
+        assert get(API_URL + "/networkinfo").json()["data"] is not None and get(API_URL + "/networkinfo").json()["data"][NETWORK]
+    except AssertionError as e:
+        print(red + "Error: The block explorer is not configured for " + NETWORK + "!" + reset)
+        exit(1)
# Check if the postgres database is set up
try:
_ = psycopg2.connect("host=" + POSTGRES_SQL_HOST + " port=" + POSTGRES_SQL_PORT + " dbname=" + POSTGRES_SQL_DB_NAME + " user=" + POSTGRES_SQL_USERNAME + " password=" + POSTGRES_SQL_PASSWORD)
except psycopg2.OperationalError as e:
print(red + "ERROR: Connection to PostgresSQL Database Failed!" + reset)
exit(1)
# Configuration alert
print("The dataset is being collected for the " + blue + NETWORK + reset + " network using " + API_URL + " as a block explorer!")
if exists("./Dataset_Files/dataset.csv"): # TODO Add deletion of enumerated files
if len(glob("./Dataset_Files/*")) > 0:
while True:
answer = input(blue + "Dataset files exists already. They will be overwritten, Delete them? (y/n) " + reset)
answer = input(blue + "Dataset files exists already. Delete all of them? (y/n) " + reset)
if answer.lower()[0] == "y":
remove("./Dataset_Files/dataset.csv")
delete_file(glob("./Dataset_Files/*"))
break
elif answer.lower()[0] == "n":
break
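Judging purely from the assertion in the block-explorer check above, the /networkinfo endpoint is expected to return a JSON object whose data field is present and carries a truthy entry for the configured network. A hypothetical response shape, with every value invented rather than taken from the explorer's documentation:

    # Hypothetical /networkinfo response shape implied by the assertion above.
    resp = {
        "status": "success",
        "data": {
            "stagenet": True,   # the assert indexes into data[NETWORK]
            "height": 1187650,
        },
    }
    assert resp["data"] is not None and resp["data"]["stagenet"]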
@@ -958,6 +982,8 @@ def main():
y = pickle.load(fp)
X_Undersampled, y_Undersampled = undersample(X, y, predicting=False)
+# Remove any columns which have the same value for every record ( not useful for ML )
+X_Undersampled.drop(list(X_Undersampled.columns[X_Undersampled.nunique() == 1]), axis=1, inplace=True)
del X, y
collect() # Garbage collector
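To make the new one-liner concrete, a toy illustration of what nunique() == 1 flags, using invented columns: a feature that is identical in every record carries no signal for the classifier, so it is dropped.

    import pandas as pd

    # Invented frame: "Ring_Size" is constant across records -> dropped.
    df = pd.DataFrame({
        "Ring_Size": [11, 11, 11],
        "Tx_Fee":    [0.01, 0.02, 0.01],
    })
    df.drop(list(df.columns[df.nunique() == 1]), axis=1, inplace=True)
    assert list(df.columns) == ["Tx_Fee"]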