import pickle
from sys import argv
from time import time
from tqdm import tqdm
from gc import collect
from requests import get
from os.path import exists
from itertools import repeat
from statistics import median
from datetime import datetime
from collections import Counter
from sklearn.utils import shuffle
from cherrypicker import CherryPicker # https://pypi.org/project/cherrypicker/
from requests.exceptions import ConnectionError
from os import walk, getcwd, chdir, listdir, makedirs
from multiprocessing import Pool, cpu_count, Manager
from pandas import DataFrame, concat, options, read_parquet
options.mode.chained_assignment = None # default='warn'
'''
Description: Combine exported Monero wallet CSVs into one dataset, enrich each
             transaction with on-chain metadata from a block explorer, flatten
             the result into a feature set, and undersample it for ML training.
Usage: ./create_dataset.py < Wallets Directory Path >
Date: 6/7/2022
Author: ACK-J
Warning: DO NOT run this with a remote node; there are a lot of blockchain lookups and it will be slow!
Warning: Run your own monerod process and block explorer.
To run your own block explorer:
    monerod --stagenet                      (https://github.com/monero-project/monero)
    xmrblocks --stagenet --enable-json-api  (https://github.com/moneroexamples/onion-monero-blockchain-explorer)
'''
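# Pipeline overview (see main() at the bottom of this file):
#   1. discover_wallet_directories() finds every folder of exported wallet
#      CSVs and merges each wallet's files via combine_files()
#   2. enrich_data() augments every transaction with on-chain metadata
#      pulled from the block explorer
#   3. create_feature_set() strips non-features, flattens each transaction,
#      and produces a feature dataframe X plus a list of labels y
#   4. undersample() balances the true-spend ring position classes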
######################
# Global Variables #
######################
data = {} # Key = tx hash, val = dict(transaction metadata)
NUM_PROCESSES = cpu_count() # Set the number of processes for multiprocessing
NETWORK = "testnet"
API_URL = "https://community.rino.io/explorer/" + NETWORK + "/api" # Remote Monero Block Explorer
API_URL = "http://127.0.0.1:8081/api" # Local Monero Block Explorer
NUM_RING_MEMBERS = 11 # DL models depend on a fixed number
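# Inputs whose ring size differs from NUM_RING_MEMBERS are skipped during
# undersampling (see the total_rings check in undersample_processing)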
# Terminal Colors
red = '\033[31m'
blue = "\033[0;34m"
yellow = "\033[1;33m"
reset = '\033[0m'
def get_xmr_block(block_num):
    """Fetch a block's JSON metadata from the block explorer by height."""
    return get(API_URL + "/block/" + str(block_num)).json()["data"]

def get_xmr_tx(tx_hash):
    """Fetch a transaction's JSON metadata from the block explorer by tx hash."""
    return get(API_URL + "/transaction/" + tx_hash).json()["data"]
def enrich_data(tx_dict_item):
"""
:param tx_dict_item:
:return:
"""
tx_hash = tx_dict_item[0]
transaction_entry = tx_dict_item[1]
tx_response = get_xmr_tx(str(tx_hash))
block_response = get_xmr_block(str(tx_response["block_height"]))
previous_block_response = get_xmr_block(str(int(tx_response["block_height"]) - 1))
transaction_entry['Tx_Size'] = tx_response["tx_size"]
# Check if the fee is missing
if 'Tx_Fee' not in transaction_entry.keys():
transaction_entry['Tx_Fee'] = float(tx_response['tx_fee'] * 0.000000000001) # Converted from piconero to monero
transaction_entry['Tx_Fee_Per_Byte'] = float(transaction_entry['Tx_Fee']) / int(transaction_entry['Tx_Size'])
transaction_entry['Num_Confirmations'] = tx_response["confirmations"]
transaction_entry['Time_Of_Enrichment'] = int(time())
if tx_response["coinbase"] == "false":
transaction_entry['Is_Coinbase_Tx'] = False
elif tx_response["coinbase"] == "true":
transaction_entry['Is_Coinbase_Tx'] = True
transaction_entry['Tx_Extra'] = tx_response["extra"]
transaction_entry['Tx_Extra_Length'] = len(tx_response["extra"])
transaction_entry['Ring_CT_Type'] = tx_response["rct_type"]
transaction_entry['Payment_ID'] = tx_response["payment_id"]
transaction_entry['Payment_ID8'] = tx_response["payment_id8"]
Total_Block_Tx_Fees = 0
for tx in block_response["txs"]:
Total_Block_Tx_Fees += int(tx["tx_fee"])
transaction_entry['Total_Block_Tx_Fees'] = float(Total_Block_Tx_Fees * 0.000000000001) # Converted from piconero to monero
transaction_entry['Block_Size'] = block_response["size"]
transaction_entry['Time_Since_Last_Block'] = int((datetime.fromtimestamp(int(block_response["timestamp"])) - datetime.fromtimestamp(int(previous_block_response["timestamp"]))).total_seconds())
    # Decoy output info: every entry in Decoys_On_Chain is a later transaction
    # that used one of this wallet's output public keys as a ring member
    for Decoy in transaction_entry['Outputs']['Decoys_On_Chain']:
        # Add temporal features for the decoy (this takes a lot of time):
        # retrieve the transaction whose ring signature includes our output
        decoy_tx_response = get_xmr_tx(str(Decoy['Tx_Hash']))
# Iterate through each input
for decoy_input in decoy_tx_response['inputs']:
            # Create an entry for the temporal data (note: this dict is
            # re-initialized for every input, so only the last input's deltas are kept)
            Decoy['Time_Deltas_Between_Ring_Members'] = {}
# Make sure there is at least 1 mixin
if len(decoy_input['mixins']) != 0:
# A place to store the block times of each ring member
Ring_Member_Times = []
# Iterate through each mixin, add it to the list and calculate the time deltas
for member_idx, each_member in enumerate(decoy_input['mixins']):
Ring_Member_Times.append(get_xmr_block(str(each_member['block_no']))['timestamp'])
# If the list has at least 2 items
if len(Ring_Member_Times) > 1:
time_delta = int((datetime.fromtimestamp(Ring_Member_Times[member_idx]) - datetime.fromtimestamp(Ring_Member_Times[member_idx - 1])).total_seconds())
Decoy['Time_Deltas_Between_Ring_Members'][str(member_idx - 1) + '_' + str(member_idx)] = time_delta
                # Add temporal features
                # Total time span of the ring signature (newest ring member block time - oldest)
                Decoy['Time_Deltas_Between_Ring_Members']['Total_Decoy_Time_Span'] = int((datetime.fromtimestamp(Ring_Member_Times[-1]) - datetime.fromtimestamp(Ring_Member_Times[0])).total_seconds())
                # Time between the newest ring member and the block time of the transaction
                Decoy['Time_Deltas_Between_Ring_Members']['Time_Delta_From_Newest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(Ring_Member_Times[-1])).total_seconds())
                # Time between the oldest ring member and the block time of the transaction
                Decoy['Time_Deltas_Between_Ring_Members']['Time_Delta_From_Oldest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(Ring_Member_Times[0])).total_seconds())
                # Mean ring member time, relative to the oldest ring member
                Decoy['Time_Deltas_Between_Ring_Members']['Mean_Ring_Time'] = int(sum(Ring_Member_Times) / len(Ring_Member_Times)) - Ring_Member_Times[0]
                # Median ring member time, relative to the oldest ring member
                Decoy['Time_Deltas_Between_Ring_Members']['Median_Ring_Time'] = int(median(Ring_Member_Times)) - Ring_Member_Times[0]
# Add Input Information
for input_idx, input in enumerate(tx_response['inputs']):
transaction_entry['Inputs'].append(
{
'Amount': input['amount'],
'Key_Image': input['key_image'],
'Ring_Members': input['mixins']
}
)
        # Create dictionaries for per-ring-member features of each previous transaction
transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Outputs'] = {}
transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Inputs'] = {}
transaction_entry['Inputs'][input_idx]['Previous_Tx_Time_Deltas'] = {}
transaction_entry['Inputs'][input_idx]['Previous_Tx_Block_Num_Delta'] = {}
transaction_entry['Inputs'][input_idx]['Previous_Tx_TxExtra_Len'] = {}
# transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Occurrences'] = {}
# transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Times'] = {}
# # Initialize the occurrences with 0's
# for each in range(len(input['mixins'])):
# transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Occurrences'][str(each)] = 0
# transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Times'][str(each)] = []
        # Iterate over each ring member in the input
for ring_mem_num, ring in enumerate(input['mixins']):
prev_tx = get_xmr_tx(ring['tx_hash'])
# Get the number of inputs and outputs from the previous transaction involving the mixin
try:
num_mixin_outputs = len(prev_tx["outputs"])
except TypeError as e: # Edge case where there are no outputs
num_mixin_outputs = 0
try:
num_mixin_inputs = len(prev_tx["inputs"])
except TypeError as e: # Edge case where there are no inputs
num_mixin_inputs = 0
# Add the number of outputs to the specific mixin
transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Outputs'][str(ring_mem_num)] = num_mixin_outputs
# Add the number of inputs to the specific mixin
transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Inputs'][str(ring_mem_num)] = num_mixin_inputs
# Find how long it has been from this block to the previous mixin transaction
transaction_entry['Inputs'][input_idx]['Previous_Tx_Time_Deltas'][str(ring_mem_num)] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(prev_tx['timestamp'])).total_seconds())
# Find how many blocks are in between this block and the mixin transaction
transaction_entry['Inputs'][input_idx]['Previous_Tx_Block_Num_Delta'][str(ring_mem_num)] = int(transaction_entry['Block_Number']) - int(prev_tx['block_height'])
# Get the length of the tx_extra from each mixin transaction
transaction_entry['Inputs'][input_idx]['Previous_Tx_TxExtra_Len'][str(ring_mem_num)] = len(prev_tx['extra'])
# # Iterate through each block between where the ring member was created and now
# for block in range((ring['block_no']+1), transaction_entry['Block_Number']):
# # Get the data for the entire block
# temp_block = get_xmr_block(block_cache, str(block))
# # Iterate over each transaction in the block
# for tx in temp_block["txs"]:
# try:
# # Get the data for each transaction and iterate over the inputs
# for each_input in get_xmr_tx(tx_cache, str(tx['tx_hash']))["inputs"]:
# # For each input iterate over each ring member
# for ring_member in each_input['mixins']:
# # Check to see if the ring members stealth address matches the current rings
# if ring_member['public_key'] == ring['public_key']:
# transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Occurrences'][str(ring_mem_num)] += 1
# transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Times'][str(ring_mem_num)].append(temp_block['timestamp'])
# except TypeError as e: # If there are no inputs
# pass
# Calculate lengths
transaction_entry['Num_Inputs'] = len(transaction_entry['Inputs'])
transaction_entry['Num_Outputs'] = len(transaction_entry['Outputs']['Output_Data'])
transaction_entry['Num_Output_Decoys'] = len(transaction_entry['Outputs']['Decoys_On_Chain'])
transaction_entry['Block_To_xmr2csv_Time_Delta'] = int((datetime.fromtimestamp(transaction_entry['xmr2csv_Data_Collection_Time']) - datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch'])).total_seconds())
# Temporal Features
if len(transaction_entry['Inputs']) != 0:
for input_idx, each_input in enumerate(transaction_entry['Inputs']):
transaction_entry['Inputs'][input_idx]['Time_Deltas_Between_Ring_Members'] = {}
# A place to store the block times of each ring member
ring_mem_times = []
if len(each_input['Ring_Members']) != 0:
for ring_num, ring_mem in enumerate(each_input['Ring_Members']):
ring_mem_times.append(get_xmr_block(str(ring_mem['block_no']))['timestamp'])
# If the list has at least 2 items
if len(ring_mem_times) > 1:
time_delta = int((datetime.fromtimestamp(ring_mem_times[ring_num]) - datetime.fromtimestamp(ring_mem_times[ring_num - 1])).total_seconds())
transaction_entry['Inputs'][input_idx]['Time_Deltas_Between_Ring_Members'][str(ring_num-1) + '_' + str(ring_num)] = time_delta
                if len(ring_mem_times) > 1:
                    # Add temporal features
                    # Total time span of the ring signature (newest ring member block time - oldest)
                    transaction_entry['Inputs'][input_idx]['Total_Ring_Time_Span'] = int((datetime.fromtimestamp(ring_mem_times[-1]) - datetime.fromtimestamp(ring_mem_times[0])).total_seconds())
                    # Time between the newest ring member and the block time of the transaction
                    transaction_entry['Inputs'][input_idx]['Time_Delta_From_Newest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(ring_mem_times[-1])).total_seconds())
                    # Time between the oldest ring member and the block time of the transaction
                    transaction_entry['Inputs'][input_idx]['Time_Delta_From_Oldest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(ring_mem_times[0])).total_seconds())
                    # Mean ring member time, relative to the oldest ring member
                    transaction_entry['Inputs'][input_idx]['Mean_Ring_Time'] = int(sum(ring_mem_times) / len(ring_mem_times)) - ring_mem_times[0]
                    # Median ring member time, relative to the oldest ring member
                    transaction_entry['Inputs'][input_idx]['Median_Ring_Time'] = int(median(ring_mem_times)) - ring_mem_times[0]
# Move labels to Input dictionary (This is kinda jank but it's the best way I can think of)
for input_key_image, true_ring_position in transaction_entry['Input_True_Rings'].items():
# Match the true spent ring's key image to one of the inputs
for each_input in transaction_entry['Inputs']:
if each_input['Key_Image'] == input_key_image:
# add a field for the input for the true ring spent
each_input['Ring_no/Ring_size'] = true_ring_position
# Delete the temporary dict() holding the true ring positions
del transaction_entry['Input_True_Rings']
# Temporal features for decoys on chain
transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain'] = {}
if len(transaction_entry['Outputs']['Decoys_On_Chain']) != 0:
# A place to store the block times of each ring member
decoys_on_chain_times = []
for member_idx, each_member in enumerate(transaction_entry['Outputs']['Decoys_On_Chain']):
decoys_on_chain_times.append(get_xmr_block(str(each_member['Block_Number']))['timestamp'])
# If the list has at least 2 items
if len(decoys_on_chain_times) > 1:
time_delta = int((datetime.fromtimestamp(decoys_on_chain_times[member_idx]) - datetime.fromtimestamp(decoys_on_chain_times[member_idx - 1])).total_seconds())
transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain'][str(member_idx-1) + '_' + str(member_idx)] = time_delta
        # Add temporal features
        # Total time span across the on-chain decoy occurrences (newest block time - oldest)
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Total_Decoy_Time_Span'] = int((datetime.fromtimestamp(decoys_on_chain_times[-1]) - datetime.fromtimestamp(decoys_on_chain_times[0])).total_seconds())
        # Time from the transaction's block to the newest decoy occurrence (decoys appear after the tx)
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Time_Delta_From_Newest_Decoy_To_Block'] = int((datetime.fromtimestamp(decoys_on_chain_times[-1]) - datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch'])).total_seconds())
        # Time from the transaction's block to the oldest decoy occurrence
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Time_Delta_From_Oldest_Decoy_To_Block'] = int((datetime.fromtimestamp(decoys_on_chain_times[0]) - datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch'])).total_seconds())
        # Mean decoy occurrence time, relative to the oldest occurrence
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Mean_Decoy_Time'] = int(sum(decoys_on_chain_times) / len(decoys_on_chain_times)) - decoys_on_chain_times[0]
        # Median decoy occurrence time, relative to the oldest occurrence
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Median_Decoy_Time'] = int(median(decoys_on_chain_times)) - decoys_on_chain_times[0]
return tx_hash, transaction_entry
def combine_files(Wallet_info):
"""
:param Wallet_info:
:return:
"""
Wallet_addr = Wallet_info[0]
Wallet_dir = Wallet_info[1]
# CSV HEADER -> "block, direction, unlocked, timestamp, amount, running balance, hash, payment ID, fee, destination, amount, index, note"
# 0 1 2 3 4 5 6 7 8 9 10 11 12
wallet_tx_data = {}
# Do some error checking, make sure the file exists
if exists(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv"):
# Open the file and get the number of lines
with open(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv", "r") as f:
            # If the file only has 1 line then it's just the csv header and the wallet had no transactions
            if len(f.readlines()) > 1:
                # If there are transactions, open the file and start parsing
with open(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv", "r") as fp:
next(fp) # Skip header of csv
for line in fp:
cli_csv_values = line.split(",")
if cli_csv_values[1].strip() == "out": # Only add outgoing transactions to the dataset
# Check if the hash is a key in the dataset
if cli_csv_values[6].strip() not in wallet_tx_data.keys():
transaction = {}
transaction['Block_Number'] = int(cli_csv_values[0].strip())
transaction['Direction'] = cli_csv_values[1].strip()
transaction['Block_Timestamp'] = cli_csv_values[3].strip()
# Convert timestamp to epoch time
p = "%Y-%m-%d %H:%M:%S"
epoch = datetime(1970, 1, 1)
transaction['Block_Timestamp_Epoch'] = int((datetime.strptime(transaction['Block_Timestamp'].strip(), p) - epoch).total_seconds())
transaction['Amount'] = float(cli_csv_values[4].strip())
transaction['Wallet_Balance'] = float(cli_csv_values[5].strip())
transaction['Tx_Fee'] = float(cli_csv_values[8].strip())
transaction['Destination_Address'] = cli_csv_values[9].strip()
transaction['Sender_Address'] = Wallet_addr
transaction['Network'] = NETWORK
transaction['Outputs'] = {}
transaction['Outputs']['Output_Data'] = list()
transaction['Outputs']['Decoys_On_Chain'] = []
transaction['Inputs'] = []
# Add the time that xmr2csv was run
with open(Wallet_dir + "/xmr2csv_start_time_" + Wallet_addr + ".csv", "r") as fp2:
for line2 in fp2:
transaction['xmr2csv_Data_Collection_Time'] = int(line2.strip())
break
# Add the transaction
wallet_tx_data[cli_csv_values[6].strip()] = transaction
# CSV HEADER -> "Timestamp,Block_no,Tx_hash,Tx_public_key,Tx_version,Payment_id,Out_idx,Amount,Output_pub_key,Output_key_img,Output_spend"
# 0 1 2 3 4 5 6 7 8 9 10
with open(Wallet_dir + "/xmr_report_" + Wallet_addr + ".csv", "r") as fp:
next(fp) # Skip header of csv
for line in fp:
xmr2csv_report_csv_values = line.split(",")
tx_hash = xmr2csv_report_csv_values[2].strip()
# Check if the tx hash is in the dataset yet
if tx_hash in wallet_tx_data.keys():
wallet_tx_data[tx_hash]['Tx_Version'] = float(xmr2csv_report_csv_values[4].strip())
wallet_tx_data[tx_hash]['Tx_Public_Key'] = xmr2csv_report_csv_values[3].strip()
wallet_tx_data[tx_hash]['Output_Pub_Key'] = xmr2csv_report_csv_values[8].strip()
wallet_tx_data[tx_hash]['Output_Key_Img'] = xmr2csv_report_csv_values[9].strip()
wallet_tx_data[tx_hash]['Out_idx'] = int(xmr2csv_report_csv_values[6].strip())
wallet_tx_data[tx_hash]['Wallet_Output_Number_Spent'] = int(xmr2csv_report_csv_values[10].strip())
# Add Output Information
output_info = get(API_URL + "/transaction/" + str(tx_hash)).json()["data"]['outputs']
for output_idx, output in enumerate(output_info):
wallet_tx_data[tx_hash]['Outputs']['Output_Data'].append({'Amount': output['amount'], 'Stealth_Address': output['public_key']})
# Open the file that has the timestamp from when the data was collected
with open(Wallet_dir + "/xmr2csv_start_time_" + Wallet_addr + ".csv", "r") as fp2:
for line2 in fp2:
wallet_tx_data[tx_hash]['xmr2csv_Data_Collection_Time'] = int(line2.strip())
break
# Search through the export of all ring member occurrences on chain to see if our output public key was used
# CSV HEADERS -> "Timestamp, Block_no, Decoy_Tx_hash, Output_pub_key, Key_image, ring_no/ring_size"
# 0 1 2 3 4 5
with open(Wallet_dir + "/xmr_report_ring_members_" + Wallet_addr + ".csv", "r") as fp2:
next(fp2) # Skip header of csv
for line2 in fp2:
ring_members_csv_values = line2.split(",")
Ring_Member = {}
# Iterate through each output from the transaction
for tx_output in wallet_tx_data[tx_hash]['Outputs']['Output_Data']:
# Check if the ring members public key matches an output in this transaction
if tx_output['Stealth_Address'] == ring_members_csv_values[3].strip():
Ring_Member['Output_Pub_Key'] = ring_members_csv_values[3].strip()
Ring_Member['Block_Number'] = int(ring_members_csv_values[1].strip())
# Convert timestamp to epoch time before saving
# https://stackoverflow.com/questions/30468371/how-to-convert-python-timestamp-string-to-epoch
p = "%Y-%m-%d %H:%M:%S"
epoch = datetime(1970, 1, 1)
ring_member_epoch_time = int((datetime.strptime(ring_members_csv_values[0].strip(), p) - epoch).total_seconds())
Ring_Member['Block_Timestamp'] = ring_member_epoch_time
Ring_Member['Key_image'] = ring_members_csv_values[4].strip()
Ring_Member['Tx_Hash'] = ring_members_csv_values[2].strip()
Ring_Member['Ring_no/Ring_size'] = ring_members_csv_values[5].strip()
                                            # Find the relative age of the output's public key on chain compared to when xmr2csv was run
                                            # (the data collection time minus the decoy's block timestamp)
Ring_Member['Ring_Member_Relative_Age'] = int((datetime.fromtimestamp(wallet_tx_data[tx_hash]['xmr2csv_Data_Collection_Time']) - datetime.fromtimestamp(Ring_Member['Block_Timestamp'])).total_seconds())
# CSV HEADERS -> "Output_pub_key, Frequency, Ring_size"
# 0 1 2
with open(Wallet_dir + "/xmr_report_ring_members_freq_" + Wallet_addr + ".csv", "r") as fp3:
next(fp3) # Skip header of csv
for line3 in fp3:
ring_member_freq_csv_values = line3.split(",")
# Check if the ring members public key matches the current public key
if wallet_tx_data[tx_hash]['Output_Pub_Key'] == ring_member_freq_csv_values[0].strip():
# Add the amount of times it has been seen on chain
Ring_Member['Ring_Member_Freq'] = int(ring_member_freq_csv_values[1].strip())
wallet_tx_data[tx_hash]['Outputs']['Decoys_On_Chain'].append(Ring_Member)
# Only collect 10 decoys found on chain because it gets too resource intensive when
# calculating all the temporal features for every decoy's ring signatures
if len(wallet_tx_data[tx_hash]['Outputs']['Decoys_On_Chain']) >= 10:
break
# CSV HEADERS -> "Timestamp, Block_no, Tx_hash, Output_pub_key, Key_image, Ring_no/Ring_size"
# 0 1 2 3 4 5
with open(Wallet_dir + "/xmr_report_outgoing_txs_" + Wallet_addr + ".csv", "r") as fp:
next(fp) # Skip header of csv
for line in fp:
xmr2csv_outgoing_csv_values = line.split(",")
# Make sure the hash exists in the dataset
if xmr2csv_outgoing_csv_values[2].strip() in wallet_tx_data.keys():
# Check if there is a dictionary to keep track of input true spends (labels)
if 'Input_True_Rings' not in wallet_tx_data[xmr2csv_outgoing_csv_values[2].strip()].keys():
wallet_tx_data[xmr2csv_outgoing_csv_values[2].strip()]['Input_True_Rings'] = {}
# Set the key image as the dictionary key and 'Ring_no/Ring_size' as the value
wallet_tx_data[xmr2csv_outgoing_csv_values[2].strip()]['Input_True_Rings'][xmr2csv_outgoing_csv_values[4].strip()] = xmr2csv_outgoing_csv_values[5].strip()
else:
print(yellow + "Warning: " + reset + str(Wallet_dir) + " did not contain any transactions!")
return wallet_tx_data
def discover_wallet_directories(dir_to_search):
"""
:param dir_to_search:
:return:
"""
# ERROR Checking if the directory is empty or not
try:
if len(listdir(dir_to_search)) == 0:
print(red + "Error: {} is an empty directory!".format(dir_to_search) + reset)
exit(1)
except FileNotFoundError as e:
print(red + "Error: {} is a non-existent directory!".format(dir_to_search) + reset)
exit(1)
# traverse root directory, and list directories as dirs and files as files
unique_directories = []
for root, dirs, files in walk(dir_to_search):
for name in files:
# Find all csv files
if name.lower().endswith(".csv"):
# Find all the unique folders holding csv files
if root not in unique_directories:
unique_directories.append(root)
cwd = getcwd() # Set a starting directory
Wallet_addrs = []
Wallet_info = []
# Go through each directory that has csv files in it
for idx, dir in tqdm(enumerate(unique_directories), desc="Enumerating Wallet Folders", total=len(unique_directories), colour='blue'):
chdir(dir)
# Iterate over the files in the directory
for root, dirs, files in walk("."):
for name in files: # Get the file name
# Get each csv file
if name.lower().endswith(".csv"):
                    # Extract the wallet address embedded in the file name
                    # (the text between the last "_" and the ".csv" extension)
                    addr = name[::-1].split(".")[1].split("_")[0][::-1]
if addr not in Wallet_addrs:
Wallet_info.append([addr, dir])
Wallet_addrs.append(addr)
                    # Don't keep looking once the two wallet addresses have been found
if len(Wallet_addrs) == 2:
break
chdir(cwd)
chdir(cwd)
del Wallet_addrs # Not needed anymore
collect() # Garbage Collector
global data # Import the global database
total_txs = 0
num_bad_txs = 0
    # Multiprocess combining the 6 csv files for each wallet
    with Pool(processes=NUM_PROCESSES) as pool:  # Pool is cleaned up automatically on exit
        for wallet_tx_data in tqdm(pool.imap_unordered(func=combine_files, iterable=Wallet_info), desc="(Multiprocessing) Combining Exported Wallet Files", total=len(Wallet_info), colour='blue'):
            # Only keep transactions that have labels (true spend ring positions)
            for tx_hash, tx_data in wallet_tx_data.items():
                if "Input_True_Rings" in tx_data.keys():
                    data[tx_hash] = tx_data
                    total_txs += 1
                else:
                    num_bad_txs += 1
print("There were " + str(num_bad_txs) + " bad transactions that were deleted out of a total " + str(total_txs) + " transactions!")
print("The dataset now includes " + str(len(data)) + " transactions.")
def clean_transaction(transaction):
"""
A transaction from the original dataset contains information not
necessarily useful for training a machine learning model. This
    information includes cryptographically random strings (wallet
    addresses and private keys) as well as human-readable strings.
This function will also strip any "deanonymized" features and
return them in a separate dictionary to be added to the labels.
:param transaction: A dictionary of transaction information
:return: A dictionary of labels associated to the inputted transaction
"""
private_info = {}
del transaction['Tx_Version']
del transaction['Block_Number']
del transaction['Block_Timestamp_Epoch']
del transaction['Num_Confirmations']
private_info['True_Ring_Pos'] = {}
del transaction['Direction']
del transaction['Block_Timestamp']
private_info['Tx_Amount'] = transaction['Amount']
del transaction['Amount']
private_info['Wallet_Balance'] = transaction['Wallet_Balance']
del transaction['Wallet_Balance']
del transaction['Destination_Address']
del transaction['Sender_Address']
del transaction['Network']
del transaction['Outputs']
# del transaction['Outputs']['Output_Data']
# del transaction['Outputs']['Decoys_On_Chain'] # TODO NEED TO EXPAND UPON THIS
for idx, input in enumerate(transaction['Inputs']):
del input['Key_Image']
del input['Ring_Members']
private_info['True_Ring_Pos'][idx] = input['Ring_no/Ring_size']
del input['Ring_no/Ring_size']
del transaction['xmr2csv_Data_Collection_Time']
del transaction['Tx_Public_Key']
del transaction['Output_Pub_Key']
del transaction['Output_Key_Img']
private_info['Out_idx'] = transaction['Out_idx']
del transaction['Out_idx']
private_info['Wallet_Output_Number_Spent'] = transaction['Wallet_Output_Number_Spent']
del transaction['Wallet_Output_Number_Spent']
del transaction['Payment_ID']
del transaction['Payment_ID8']
del transaction['Time_Of_Enrichment']
del transaction['Tx_Extra'] # TODO NEED TO USE THIS LATER ON
del transaction['Num_Output_Decoys'] # TODO
del transaction['Block_To_xmr2csv_Time_Delta']
return private_info
def create_feature_set(database):
"""
This function takes in a nested python dictionary dataset, removes
any entries that would not be a useful feature to a machine learning
model, flattens the dictionary and converts it to a dataframe. An
accompanying labels list is also returned.
:param database: Nested dictionary of Monero transaction metadata
:return: A pandas dataframe of the input data and a list of labels
"""
labels = []
Valid_Transactions = []
num_errors = 0
feature_set = dict()
num_of_valid_txs = 0 # Incrementer which doesn't count invalid txs
# Iterate through each tx hash in the database dict
for idx, tx_hash in tqdm(enumerate(database.keys()), total=len(database), colour='blue', desc="Cleaning Transactions"):
# Pass the transaction ( by reference ) to be stripped of non-features and receive the labels back
try:
private_info = clean_transaction(database[tx_hash])
except Exception as e:
num_errors += 1
            continue  # Don't process this tx; move on to the next one
# add tx hash to good list
#Valid_Transactions.append(DataFrame(CherryPicker(database[tx_hash]).flatten(delim='.').get(), index=[idx]))
# Flatten each transaction and iterate over each feature
for k, v in CherryPicker(database[tx_hash]).flatten(delim='.').get().items():
# Check if the feature name is not already in the feature set
if k not in feature_set.keys():
feature_set[k] = []
                    # Pad with -1 for every earlier transaction that lacked this feature
                    for i in range(num_of_valid_txs):
                        feature_set[k].append(-1)
# Add it as a new feature
feature_set[k].append(v)
else: # If the feature is already in the feature set
# Check if there are any transactions that did not have this feature
if len(feature_set[k]) < num_of_valid_txs:
                        # Pad with -1 for each earlier transaction that lacked this feature
                        for i in range(num_of_valid_txs - len(feature_set[k])):
                            feature_set[k].append(-1)
# Append the feature
feature_set[k].append(v)
num_of_valid_txs += 1
# add the labels to the list
labels.append(private_info)
print("Number of skipped transactions:", num_errors)
assert len(labels) != 0
del database
collect() # Garbage Collector
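    # DataFrame.from_dict(orient='index') builds one row per feature key, so
    # transposing yields one row per transaction; feature lists shorter than
    # the longest one are padded with NaN and then filled with -1 below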
feature_set_df = DataFrame.from_dict(feature_set, orient='index').transpose()
feature_set_df.fillna(-1, inplace=True)
del feature_set
collect() # Garbage collector
# Sanity check
assert len(labels) == len(feature_set_df)
# Combine dataframes together
# https://www.confessionsofadataguy.com/solving-the-memory-hungry-pandas-concat-problem/
#feature_set = concat(Valid_Transactions, axis=0).fillna(-1)
# Shuffle the data
feature_set_df, labels = shuffle(feature_set_df, labels)
feature_set_df, labels = shuffle(feature_set_df, labels)
feature_set_df, labels = shuffle(feature_set_df, labels)
# Reset the indexing after the shuffles
feature_set_df.reset_index(drop=True, inplace=True)
    return feature_set_df, labels
def undersample_processing(y, ns, min_occurrences, occurrences):
"""
:param y:
:param ns:
:param min_occurrences:
:param occurrences:
:return:
"""
undersampled_y = []
new_X = []
y_idx, ring_array = y
# For each array of ring members iterate over each index
for ring_array_idx in range(len(ring_array["True_Ring_Pos"])):
# Get the true ring position (label) for the current iteration
ring_pos = int(ring_array["True_Ring_Pos"][ring_array_idx].split("/")[0])
total_rings = int(ring_array["True_Ring_Pos"][ring_array_idx].split("/")[1])
# Check to see if we hit the maximum number of labels for this position and that the
# number of ring members is what we expect.
if occurrences[ring_pos] < min_occurrences and total_rings == NUM_RING_MEMBERS:
occurrences[ring_pos] = occurrences[ring_pos] + 1
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
# Slice out the row from the dataframe but keep it as a dataframe
temp_df = ns.X.iloc[[y_idx]]
# Go through each column name in the temp dataframe
for col_name in temp_df.columns:
# Check if the column name has data relating to irrelevant ring signatures
if "Inputs." in col_name and "." + str(ring_array_idx) + "." not in col_name:
# Delete the columns
temp_df = temp_df.drop([col_name], axis=1)
# Check if the column name is for the current ring signature
elif "Inputs." in col_name and "." + str(ring_array_idx) + "." in col_name:
# Rename the column such that it doesn't have the .0. or .1. positioning information
temp_df.rename(columns={col_name: col_name.replace("Inputs." + str(ring_array_idx) + ".", "Input.")}, inplace=True)
# Add to the new X and y dataframes
new_X.append(temp_df)
undersampled_y.append(ring_pos)
return new_X, undersampled_y
def undersample_processing_wrapper(y_X_min_occurrences_Occurrences):
"""
:param y_X_min_occurrences_Occurrences:
:return:
"""
return undersample_processing(*y_X_min_occurrences_Occurrences)
def undersample(X, y):
"""
:param X:
:param y:
:return:
"""
# Flatten the ring signature labels into a list
flattened_true_spend = []
for ring_array in y:
for idx, true_ring_pos in ring_array["True_Ring_Pos"].items():
flattened_true_spend.append(int(true_ring_pos.split("/")[0]))
    # Reset pandas indexing just in case
X.reset_index(drop=True, inplace=True)
# Count the amount of true labels at each position in the ring signature
labels_distribution = Counter(flattened_true_spend)
# Error checking
try:
# Make sure that there are no classes with 0 labels
assert len(labels_distribution) == NUM_RING_MEMBERS
except AssertionError as e:
print(red + "Error: The dataset contains at least one class which has 0 labels!" + reset)
exit(1)
    # Find the smallest number of occurrences (the rarest class)
    min_occurrences = labels_distribution.most_common()[-1][1]
print("Undersampling to " + str(min_occurrences) + " transactions per class. A total of " + str(min_occurrences*NUM_RING_MEMBERS) + " transactions.")
#max_occurrences = labels_distribution.most_common(1)[0][1]
undersampled_y = []
new_X = []
with Manager() as manager:
# https://stackoverflow.com/questions/19887087/how-to-share-pandas-dataframe-object-between-processes
ns = manager.Namespace()
ns.X = X
# Create a dictionary for all 11 spots in a ring signature
occurrences = manager.dict()
for i in range(NUM_RING_MEMBERS):
occurrences[i + 1] = 0
        # Multiprocess undersampling each transaction
with manager.Pool(processes=NUM_PROCESSES) as pool:
for result in tqdm(pool.imap_unordered(func=undersample_processing_wrapper,
iterable=zip(
list(enumerate(y)),
repeat(ns, len(y)),
repeat(min_occurrences, len(y)),
repeat(occurrences, len(y))
)
),
desc="(Multiprocessing) Undersampling Dataset",
total=len(y),
colour='blue'
):
subset_new_X = result[0]
subset_undersampled_y = result[1]
# Add to the new X and y dataframes
new_X = new_X + subset_new_X
undersampled_y = undersampled_y + subset_undersampled_y
del X # Remove the old dataset to save RAM
collect() # Garbage collector
# Combine the list of dataframes together into a single DF
undersampled_X = concat(new_X, axis=0)
del new_X
collect() # Garbage collector
# Sometimes there is a race condition where a class will get +1 samples in the class ( most of the time this happens while debugging )
assert len(undersampled_X) == len(undersampled_y) == (min_occurrences * NUM_RING_MEMBERS)
# Shuffle the data one last time
undersampled_X, undersampled_y = shuffle(undersampled_X, undersampled_y)
undersampled_X, undersampled_y = shuffle(undersampled_X, undersampled_y)
undersampled_X, undersampled_y = shuffle(undersampled_X, undersampled_y)
undersampled_X.reset_index(drop=True, inplace=True)
return undersampled_X, undersampled_y
def main():
# Error Checking for command line args
if len(argv) != 2:
print("Usage Error: ./create_dataset.py < Wallets Directory Path >")
exit(1)
    try:  # Check that the block explorer at API_URL can be reached
        assert get(API_URL + "/block/1").status_code == 200
    except (ConnectionError, AssertionError):
        print("Error: " + red + NETWORK + reset + " block explorer located at " + API_URL + " refused the connection or returned an error!")
exit(1)
# Configuration alert
print("The dataset is being collected for the " + blue + NETWORK + reset + " network using " + API_URL + " as a block explorer!")
###########################################
# Create the dataset from files on disk #
###########################################
global data
print(blue + "Opening " + str(argv[1]) + reset + "\n")
# Find where the wallets are stored and combine the exported csv files
discover_wallet_directories(argv[1])
# Multiprocessing References
# https://leimao.github.io/blog/Python-tqdm-Multiprocessing/
# https://thebinarynotes.com/python-multiprocessing/
# https://docs.python.org/3/library/multiprocessing.html
# https://stackoverflow.com/questions/6832554/multiprocessing-how-do-i-share-a-dict-among-multiple-processes
with Manager() as manager:
# Multiprocessing enriching each transaction
with manager.Pool(processes=NUM_PROCESSES) as pool:
for result in tqdm(pool.imap_unordered(func=enrich_data, iterable=list(data.items())), desc="(Multiprocessing) Enriching Transaction Data", total=len(data), colour='blue'):
tx_hash, transaction_entry = result[0], result[1] # Unpack the values returned
data[tx_hash] = transaction_entry # Set the enriched version of the tx
    # Save the raw database to disk (create the output directory if needed)
    makedirs("./Dataset_Files", exist_ok=True)
    with open("./Dataset_Files/dataset.pkl", "wb") as fp:
        pickle.dump(data, fp)
print("./Dataset_Files/dataset.pkl written to disk!")
#################################
# Remove Unnecessary Features #
#################################
with open("./Dataset_Files/dataset.pkl", "rb") as fp:
data = pickle.load(fp)
# Feature selection on raw dataset
X, y = create_feature_set(data)
del data
collect() # Garbage collector
# Save data and labels to disk for future AI training
with open("./Dataset_Files/X.pkl", "wb") as fp:
pickle.dump(X, fp)
with open("./Dataset_Files/y.pkl", "wb") as fp:
pickle.dump(y, fp)
# Error checking; labels and data should be the same length
assert len(X) == len(y)
print("./Dataset_Files/X.pkl and ./Dataset_Files/y.pkl written to disk!")
###################
# Undersampling #
###################
with open("./Dataset_Files/X.pkl", "rb") as fp:
X = pickle.load(fp)
with open("./Dataset_Files/y.pkl", "rb") as fp:
y = pickle.load(fp)
X_Undersampled, y_Undersampled = undersample(X, y)
del X
collect() # Garbage collector
with open("./Dataset_Files/X_Undersampled.pkl", "wb") as fp:
pickle.dump(X_Undersampled, fp)
with open("./Dataset_Files/y_Undersampled.pkl", "wb") as fp:
pickle.dump(y_Undersampled, fp)
print("./Dataset_Files/X_Undersampled.pkl and ./Dataset_Files/y_Undersampled.pkl written to disk!\nFinished")
if __name__ == '__main__':
try:
main()
# Gracefully exits if user hits CTRL + C
except KeyboardInterrupt as e:
print("Error: User stopped the script's execution!")
exit(1)
except Exception as e:
import traceback
        print(e)
        traceback.print_exc()
exit(1)