import pickle
from sys import argv
from time import time
from tqdm import tqdm
from gc import collect
from requests import get
from os.path import exists
from itertools import repeat
from statistics import median
from datetime import datetime
from collections import Counter
from sklearn.utils import shuffle
from cherrypicker import CherryPicker  # https://pypi.org/project/cherrypicker/
from requests.exceptions import ConnectionError
from os import walk, getcwd, chdir, listdir, makedirs
from multiprocessing import Pool, cpu_count, Manager
from pandas import DataFrame, concat, options

options.mode.chained_assignment = None  # default='warn'

'''
Description: Build a labeled Monero transaction dataset. The script combines the
             CSV files exported for each wallet, enriches every outgoing
             transaction with on-chain metadata from a block explorer, and
             writes the resulting features and labels to disk.

Usage: ./create_dataset.py < Wallets Directory Path >

Date: 6/7/2022
Author: ACK-J

Warning: DO NOT run this with a remote node; there are a lot of blockchain lookups and it will be slow!
Warning: Run your own monerod process and block explorer.

To run your own block explorer:
    monerod --stagenet                       https://github.com/monero-project/monero
    xmrblocks --stagenet --enable-json-api   https://github.com/moneroexamples/onion-monero-blockchain-explorer
'''

######################
# Global Variables #
######################
data = {}  # Key = tx hash, val = dict(transaction metadata)
NUM_PROCESSES = cpu_count()  # Set the number of processes for multiprocessing
NETWORK = "testnet"
# API_URL = "https://community.rino.io/explorer/" + NETWORK + "/api"  # Remote Monero Block Explorer (slow; see warnings above)
API_URL = "http://127.0.0.1:8081/api"  # Local Monero Block Explorer
NUM_RING_MEMBERS = 11  # DL models depend on a fixed number of ring members
PICONERO = 0.000000000001  # 1 piconero = 1e-12 XMR; used to convert atomic units to monero

# Terminal Colors
red = '\033[31m'
blue = "\033[0;34m"
yellow = "\033[1;33m"
reset = '\033[0m'


def get_xmr_block(block_num):
    """ Fetch a block, by height, from the block explorer's JSON API. """
    return get(API_URL + "/block/" + str(block_num)).json()["data"]


def get_xmr_tx(tx_hash):
    """ Fetch a transaction, by hash, from the block explorer's JSON API. """
    return get(API_URL + "/transaction/" + tx_hash).json()["data"]
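

# The two helpers above fire one HTTP request per lookup and assume the explorer
# never drops the connection. Below is a minimal retry-wrapper sketch; it is NOT
# wired into the pipeline, and the retry count and backoff are illustrative
# assumptions rather than part of the original script:
def get_json_with_retry(url, retries=3, backoff_seconds=2):
    """ Fetch url and return the explorer's "data" payload, retrying on connection errors. """
    from time import sleep  # local import; the module-level time import only pulls in time()
    for attempt in range(retries):
        try:
            return get(url).json()["data"]
        except ConnectionError:
            sleep(backoff_seconds * (attempt + 1))  # linear backoff between attempts
    raise ConnectionError("Block explorer unreachable: " + url)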


def enrich_data(tx_dict_item):
    """
    Enrich a single wallet transaction with metadata pulled from the block
    explorer: size, fees, confirmations, tx_extra, and ring member timings.
    :param tx_dict_item: Tuple of (tx hash, transaction metadata dict)
    :return: Tuple of (tx hash, enriched transaction metadata dict)
    """
    tx_hash = tx_dict_item[0]
    transaction_entry = tx_dict_item[1]
    tx_response = get_xmr_tx(str(tx_hash))
    block_response = get_xmr_block(str(tx_response["block_height"]))
    previous_block_response = get_xmr_block(str(int(tx_response["block_height"]) - 1))
    transaction_entry['Tx_Size'] = tx_response["tx_size"]
    # Check if the fee is missing
    if 'Tx_Fee' not in transaction_entry.keys():
        transaction_entry['Tx_Fee'] = float(tx_response['tx_fee'] * PICONERO)  # Converted from piconero to monero
    transaction_entry['Tx_Fee_Per_Byte'] = float(transaction_entry['Tx_Fee']) / int(transaction_entry['Tx_Size'])
    transaction_entry['Num_Confirmations'] = tx_response["confirmations"]
    transaction_entry['Time_Of_Enrichment'] = int(time())
    # The explorer may serialize the coinbase flag as a JSON bool or a string
    transaction_entry['Is_Coinbase_Tx'] = tx_response["coinbase"] in (True, "true")
    transaction_entry['Tx_Extra'] = tx_response["extra"]
    transaction_entry['Tx_Extra_Length'] = len(tx_response["extra"])
    transaction_entry['Ring_CT_Type'] = tx_response["rct_type"]
    transaction_entry['Payment_ID'] = tx_response["payment_id"]
    transaction_entry['Payment_ID8'] = tx_response["payment_id8"]

    Total_Block_Tx_Fees = 0
    for tx in block_response["txs"]:
        Total_Block_Tx_Fees += int(tx["tx_fee"])
    transaction_entry['Total_Block_Tx_Fees'] = float(Total_Block_Tx_Fees * PICONERO)  # Converted from piconero to monero
    transaction_entry['Block_Size'] = block_response["size"]
    transaction_entry['Time_Since_Last_Block'] = int((datetime.fromtimestamp(int(block_response["timestamp"])) - datetime.fromtimestamp(int(previous_block_response["timestamp"]))).total_seconds())

    # Output info
    for Decoy in transaction_entry['Outputs']['Decoys_On_Chain']:
        # Add temporal features for the decoy ( this takes up a ton of time ):
        # retrieve the transaction information about the decoy ring signatures
        decoy_tx_response = get_xmr_tx(str(Decoy['Tx_Hash']))
        # Iterate through each input
        for decoy_input in decoy_tx_response['inputs']:
            # Create an entry for the temporal data
            Decoy['Time_Deltas_Between_Ring_Members'] = {}
            # Make sure there is at least 1 mixin
            if len(decoy_input['mixins']) != 0:
                # A place to store the block times of each ring member
                Ring_Member_Times = []
                # Iterate through each mixin, add it to the list and calculate the time deltas
                for member_idx, each_member in enumerate(decoy_input['mixins']):
                    Ring_Member_Times.append(get_xmr_block(str(each_member['block_no']))['timestamp'])
                    # If the list has at least 2 items
                    if len(Ring_Member_Times) > 1:
                        time_delta = int((datetime.fromtimestamp(Ring_Member_Times[member_idx]) - datetime.fromtimestamp(Ring_Member_Times[member_idx - 1])).total_seconds())
                        Decoy['Time_Deltas_Between_Ring_Members'][str(member_idx - 1) + '_' + str(member_idx)] = time_delta
                # Add temporal features
                # Calculate the total time span of the ring signature ( newest on-chain block time - oldest on-chain block time )
                Decoy['Time_Deltas_Between_Ring_Members']['Total_Decoy_Time_Span'] = int((datetime.fromtimestamp(Ring_Member_Times[-1]) - datetime.fromtimestamp(Ring_Member_Times[0])).total_seconds())
                # Calculate the time between the newest ring member in the signature and the block time of the transaction
                Decoy['Time_Deltas_Between_Ring_Members']['Time_Delta_From_Newest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(Ring_Member_Times[-1])).total_seconds())
                # Calculate the time between the oldest ring member in the signature and the block time of the transaction
                Decoy['Time_Deltas_Between_Ring_Members']['Time_Delta_From_Oldest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(Ring_Member_Times[0])).total_seconds())
                # Calculate the mean of the ring times, relative to the oldest ring member
                Decoy['Time_Deltas_Between_Ring_Members']['Mean_Ring_Time'] = int(sum(Ring_Member_Times) / len(Ring_Member_Times)) - Ring_Member_Times[0]
                # Calculate the median of the ring times, relative to the oldest ring member
                Decoy['Time_Deltas_Between_Ring_Members']['Median_Ring_Time'] = int(median(Ring_Member_Times)) - Ring_Member_Times[0]
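
    # For illustration: three ring members mined at epoch times [100, 160, 220]
    # ( made-up numbers ) would yield Time_Deltas_Between_Ring_Members
    # {'0_1': 60, '1_2': 60}, a Total_Decoy_Time_Span of 120 seconds, and a
    # Mean_Ring_Time / Median_Ring_Time of 60, relative to the oldest member.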

    # Add Input Information
    for input_idx, tx_input in enumerate(tx_response['inputs']):
        transaction_entry['Inputs'].append(
            {
                'Amount': tx_input['amount'],
                'Key_Image': tx_input['key_image'],
                'Ring_Members': tx_input['mixins']
            }
        )
        # Create dictionaries for each of the previous-transaction features
        transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Outputs'] = {}
        transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Inputs'] = {}
        transaction_entry['Inputs'][input_idx]['Previous_Tx_Time_Deltas'] = {}
        transaction_entry['Inputs'][input_idx]['Previous_Tx_Block_Num_Delta'] = {}
        transaction_entry['Inputs'][input_idx]['Previous_Tx_TxExtra_Len'] = {}
        # transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Occurrences'] = {}
        # transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Times'] = {}
        # # Initialize the occurrences with 0's
        # for each in range(len(tx_input['mixins'])):
        #     transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Occurrences'][str(each)] = 0
        #     transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Times'][str(each)] = []

        # Iterate over each ring member in the input
        for ring_mem_num, ring in enumerate(tx_input['mixins']):
            prev_tx = get_xmr_tx(ring['tx_hash'])
            # Get the number of inputs and outputs from the previous transaction involving the mixin
            try:
                num_mixin_outputs = len(prev_tx["outputs"])
            except TypeError:  # Edge case where there are no outputs
                num_mixin_outputs = 0
            try:
                num_mixin_inputs = len(prev_tx["inputs"])
            except TypeError:  # Edge case where there are no inputs
                num_mixin_inputs = 0
            # Add the number of outputs of the specific mixin's previous transaction
            transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Outputs'][str(ring_mem_num)] = num_mixin_outputs
            # Add the number of inputs of the specific mixin's previous transaction
            transaction_entry['Inputs'][input_idx]['Previous_Tx_Num_Inputs'][str(ring_mem_num)] = num_mixin_inputs
            # Find how long it has been from this block to the previous mixin transaction
            transaction_entry['Inputs'][input_idx]['Previous_Tx_Time_Deltas'][str(ring_mem_num)] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(prev_tx['timestamp'])).total_seconds())
            # Find how many blocks are in between this block and the mixin transaction
            transaction_entry['Inputs'][input_idx]['Previous_Tx_Block_Num_Delta'][str(ring_mem_num)] = int(transaction_entry['Block_Number']) - int(prev_tx['block_height'])
            # Get the length of the tx_extra from each mixin transaction
            transaction_entry['Inputs'][input_idx]['Previous_Tx_TxExtra_Len'][str(ring_mem_num)] = len(prev_tx['extra'])

            # # Iterate through each block between where the ring member was created and now
            # for block in range((ring['block_no'] + 1), transaction_entry['Block_Number']):
            #     # Get the data for the entire block
            #     temp_block = get_xmr_block(block_cache, str(block))
            #     # Iterate over each transaction in the block
            #     for tx in temp_block["txs"]:
            #         try:
            #             # Get the data for each transaction and iterate over the inputs
            #             for each_input in get_xmr_tx(tx_cache, str(tx['tx_hash']))["inputs"]:
            #                 # For each input iterate over each ring member
            #                 for ring_member in each_input['mixins']:
            #                     # Check to see if the ring member's stealth address matches the current ring's
            #                     if ring_member['public_key'] == ring['public_key']:
            #                         transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Occurrences'][str(ring_mem_num)] += 1
            #                         transaction_entry['Inputs'][input_idx]['Previous_Tx_Decoy_Times'][str(ring_mem_num)].append(temp_block['timestamp'])
            #         except TypeError:  # If there are no inputs
            #             pass

    # Calculate lengths
    transaction_entry['Num_Inputs'] = len(transaction_entry['Inputs'])
    transaction_entry['Num_Outputs'] = len(transaction_entry['Outputs']['Output_Data'])
    transaction_entry['Num_Output_Decoys'] = len(transaction_entry['Outputs']['Decoys_On_Chain'])
    transaction_entry['Block_To_xmr2csv_Time_Delta'] = int((datetime.fromtimestamp(transaction_entry['xmr2csv_Data_Collection_Time']) - datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch'])).total_seconds())

    # Temporal Features
    if len(transaction_entry['Inputs']) != 0:
        for input_idx, each_input in enumerate(transaction_entry['Inputs']):
            transaction_entry['Inputs'][input_idx]['Time_Deltas_Between_Ring_Members'] = {}
            # A place to store the block times of each ring member
            ring_mem_times = []
            if len(each_input['Ring_Members']) != 0:
                for ring_num, ring_mem in enumerate(each_input['Ring_Members']):
                    ring_mem_times.append(get_xmr_block(str(ring_mem['block_no']))['timestamp'])
                    # If the list has at least 2 items
                    if len(ring_mem_times) > 1:
                        time_delta = int((datetime.fromtimestamp(ring_mem_times[ring_num]) - datetime.fromtimestamp(ring_mem_times[ring_num - 1])).total_seconds())
                        transaction_entry['Inputs'][input_idx]['Time_Deltas_Between_Ring_Members'][str(ring_num - 1) + '_' + str(ring_num)] = time_delta
                if len(ring_mem_times) > 1:
                    # Add temporal features
                    # Calculate the total time span of the ring signature ( newest on-chain block time - oldest on-chain block time )
                    transaction_entry['Inputs'][input_idx]['Total_Ring_Time_Span'] = int((datetime.fromtimestamp(ring_mem_times[-1]) - datetime.fromtimestamp(ring_mem_times[0])).total_seconds())
                    # Calculate the time between the newest ring member in the signature and the block time of the transaction
                    transaction_entry['Inputs'][input_idx]['Time_Delta_From_Newest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(ring_mem_times[-1])).total_seconds())
                    # Calculate the time between the oldest ring member in the signature and the block time of the transaction
                    transaction_entry['Inputs'][input_idx]['Time_Delta_From_Oldest_Ring_To_Block'] = int((datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch']) - datetime.fromtimestamp(ring_mem_times[0])).total_seconds())
                    # Calculate the mean of the ring times, relative to the oldest ring member
                    transaction_entry['Inputs'][input_idx]['Mean_Ring_Time'] = int(sum(ring_mem_times) / len(ring_mem_times)) - ring_mem_times[0]
                    # Calculate the median of the ring times, relative to the oldest ring member
                    transaction_entry['Inputs'][input_idx]['Median_Ring_Time'] = int(median(ring_mem_times)) - ring_mem_times[0]

    # Move labels into the Input dictionary (this is kinda jank but it's the best way I can think of)
    for input_key_image, true_ring_position in transaction_entry['Input_True_Rings'].items():
        # Match the true spent ring's key image to one of the inputs
        for each_input in transaction_entry['Inputs']:
            if each_input['Key_Image'] == input_key_image:
                # Add a field on the input for the true ring spent
                each_input['Ring_no/Ring_size'] = true_ring_position
    # Delete the temporary dict() holding the true ring positions
    del transaction_entry['Input_True_Rings']

    # Temporal features for decoys on chain
    transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain'] = {}
    if len(transaction_entry['Outputs']['Decoys_On_Chain']) != 0:
        # A place to store the block times of each decoy
        decoys_on_chain_times = []
        for member_idx, each_member in enumerate(transaction_entry['Outputs']['Decoys_On_Chain']):
            decoys_on_chain_times.append(get_xmr_block(str(each_member['Block_Number']))['timestamp'])
            # If the list has at least 2 items
            if len(decoys_on_chain_times) > 1:
                time_delta = int((datetime.fromtimestamp(decoys_on_chain_times[member_idx]) - datetime.fromtimestamp(decoys_on_chain_times[member_idx - 1])).total_seconds())
                transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain'][str(member_idx - 1) + '_' + str(member_idx)] = time_delta
        # Add temporal features
        # Calculate the total time span of the decoys ( newest on-chain block time - oldest on-chain block time )
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Total_Decoy_Time_Span'] = int((datetime.fromtimestamp(decoys_on_chain_times[-1]) - datetime.fromtimestamp(decoys_on_chain_times[0])).total_seconds())
        # Calculate the time from the block time of the transaction to the newest decoy ( decoys appear after the transaction )
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Time_Delta_From_Newest_Decoy_To_Block'] = int((datetime.fromtimestamp(decoys_on_chain_times[-1]) - datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch'])).total_seconds())
        # Calculate the time from the block time of the transaction to the oldest decoy
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Time_Delta_From_Oldest_Decoy_To_Block'] = int((datetime.fromtimestamp(decoys_on_chain_times[0]) - datetime.fromtimestamp(transaction_entry['Block_Timestamp_Epoch'])).total_seconds())
        # Calculate the mean of the decoy times, relative to the oldest decoy
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Mean_Decoy_Time'] = int(sum(decoys_on_chain_times) / len(decoys_on_chain_times)) - decoys_on_chain_times[0]
        # Calculate the median of the decoy times, relative to the oldest decoy
        transaction_entry['Outputs']['Time_Deltas_Between_Decoys_On_Chain']['Median_Decoy_Time'] = int(median(decoys_on_chain_times)) - decoys_on_chain_times[0]
    return tx_hash, transaction_entry


def combine_files(Wallet_info):
    """
    Parse the CSV files exported for a single wallet and merge them into one
    dictionary of outgoing transactions keyed by tx hash.
    :param Wallet_info: List of [wallet address, wallet directory path]
    :return: Dictionary mapping tx hash -> transaction metadata
    """
    Wallet_addr = Wallet_info[0]
    Wallet_dir = Wallet_info[1]
    # CSV HEADER -> "block, direction, unlocked, timestamp, amount, running balance, hash, payment ID, fee, destination, amount, index, note"
    #                  0        1          2         3         4           5           6        7       8        9         10     11    12
    wallet_tx_data = {}

    # Do some error checking; make sure the file exists
    if exists(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv"):
        # Open the file and get the number of lines
        with open(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv", "r") as f:
            # If the file only has 1 line then it's just the csv header and the wallet had no transactions
            if len(f.readlines()) > 1:
                # If there are transactions, open the file and start parsing
                with open(Wallet_dir + "/cli_export_" + Wallet_addr + ".csv", "r") as fp:
                    next(fp)  # Skip header of csv
                    for line in fp:
                        cli_csv_values = line.split(",")
                        if cli_csv_values[1].strip() == "out":  # Only add outgoing transactions to the dataset
                            # Check if the hash is a key in the dataset
                            if cli_csv_values[6].strip() not in wallet_tx_data.keys():
                                transaction = {}
                                transaction['Block_Number'] = int(cli_csv_values[0].strip())
                                transaction['Direction'] = cli_csv_values[1].strip()
                                transaction['Block_Timestamp'] = cli_csv_values[3].strip()
                                # Convert timestamp to epoch time
                                p = "%Y-%m-%d %H:%M:%S"
                                epoch = datetime(1970, 1, 1)
                                transaction['Block_Timestamp_Epoch'] = int((datetime.strptime(transaction['Block_Timestamp'].strip(), p) - epoch).total_seconds())

                                transaction['Amount'] = float(cli_csv_values[4].strip())
                                transaction['Wallet_Balance'] = float(cli_csv_values[5].strip())
                                transaction['Tx_Fee'] = float(cli_csv_values[8].strip())
                                transaction['Destination_Address'] = cli_csv_values[9].strip()
                                transaction['Sender_Address'] = Wallet_addr
                                transaction['Network'] = NETWORK

                                transaction['Outputs'] = {}
                                transaction['Outputs']['Output_Data'] = list()
                                transaction['Outputs']['Decoys_On_Chain'] = []
                                transaction['Inputs'] = []

                                # Add the time that xmr2csv was run
                                with open(Wallet_dir + "/xmr2csv_start_time_" + Wallet_addr + ".csv", "r") as fp2:
                                    for line2 in fp2:
                                        transaction['xmr2csv_Data_Collection_Time'] = int(line2.strip())
                                        break
                                # Add the transaction
                                wallet_tx_data[cli_csv_values[6].strip()] = transaction

                # CSV HEADER -> "Timestamp,Block_no,Tx_hash,Tx_public_key,Tx_version,Payment_id,Out_idx,Amount,Output_pub_key,Output_key_img,Output_spend"
                #                    0         1       2          3           4          5         6      7          8              9             10
                with open(Wallet_dir + "/xmr_report_" + Wallet_addr + ".csv", "r") as fp:
                    next(fp)  # Skip header of csv
                    for line in fp:
                        xmr2csv_report_csv_values = line.split(",")
                        tx_hash = xmr2csv_report_csv_values[2].strip()
                        # Check if the tx hash is in the dataset yet
                        if tx_hash in wallet_tx_data.keys():
                            wallet_tx_data[tx_hash]['Tx_Version'] = float(xmr2csv_report_csv_values[4].strip())
                            wallet_tx_data[tx_hash]['Tx_Public_Key'] = xmr2csv_report_csv_values[3].strip()
                            wallet_tx_data[tx_hash]['Output_Pub_Key'] = xmr2csv_report_csv_values[8].strip()
                            wallet_tx_data[tx_hash]['Output_Key_Img'] = xmr2csv_report_csv_values[9].strip()
                            wallet_tx_data[tx_hash]['Out_idx'] = int(xmr2csv_report_csv_values[6].strip())
                            wallet_tx_data[tx_hash]['Wallet_Output_Number_Spent'] = int(xmr2csv_report_csv_values[10].strip())
                            # Add Output Information
                            output_info = get_xmr_tx(str(tx_hash))['outputs']
                            for output_idx, output in enumerate(output_info):
                                wallet_tx_data[tx_hash]['Outputs']['Output_Data'].append({'Amount': output['amount'], 'Stealth_Address': output['public_key']})

                            # Open the file that has the timestamp from when the data was collected
                            with open(Wallet_dir + "/xmr2csv_start_time_" + Wallet_addr + ".csv", "r") as fp2:
                                for line2 in fp2:
                                    wallet_tx_data[tx_hash]['xmr2csv_Data_Collection_Time'] = int(line2.strip())
                                    break

                            # Search through the export of all ring member occurrences on chain to see if our output public key was used
                            # CSV HEADERS -> "Timestamp, Block_no, Decoy_Tx_hash, Output_pub_key, Key_image, ring_no/ring_size"
                            #                     0          1           2               3             4             5
                            with open(Wallet_dir + "/xmr_report_ring_members_" + Wallet_addr + ".csv", "r") as fp2:
                                next(fp2)  # Skip header of csv
                                for line2 in fp2:
                                    ring_members_csv_values = line2.split(",")
                                    Ring_Member = {}
                                    # Iterate through each output from the transaction
                                    for tx_output in wallet_tx_data[tx_hash]['Outputs']['Output_Data']:
                                        # Check if the ring member's public key matches an output in this transaction
                                        if tx_output['Stealth_Address'] == ring_members_csv_values[3].strip():
                                            Ring_Member['Output_Pub_Key'] = ring_members_csv_values[3].strip()
                                            Ring_Member['Block_Number'] = int(ring_members_csv_values[1].strip())
                                            # Convert timestamp to epoch time before saving
                                            # https://stackoverflow.com/questions/30468371/how-to-convert-python-timestamp-string-to-epoch
                                            p = "%Y-%m-%d %H:%M:%S"
                                            epoch = datetime(1970, 1, 1)
                                            ring_member_epoch_time = int((datetime.strptime(ring_members_csv_values[0].strip(), p) - epoch).total_seconds())
                                            Ring_Member['Block_Timestamp'] = ring_member_epoch_time
                                            Ring_Member['Key_image'] = ring_members_csv_values[4].strip()
                                            Ring_Member['Tx_Hash'] = ring_members_csv_values[2].strip()
                                            Ring_Member['Ring_no/Ring_size'] = ring_members_csv_values[5].strip()
                                            # Find the relative age of the output's public key on the chain compared to when xmr2csv was run:
                                            # the time when the data was collected minus the decoy block timestamp
                                            Ring_Member['Ring_Member_Relative_Age'] = int((datetime.fromtimestamp(wallet_tx_data[tx_hash]['xmr2csv_Data_Collection_Time']) - datetime.fromtimestamp(Ring_Member['Block_Timestamp'])).total_seconds())

                                            # CSV HEADERS -> "Output_pub_key, Frequency, Ring_size"
                                            #                       0             1          2
                                            with open(Wallet_dir + "/xmr_report_ring_members_freq_" + Wallet_addr + ".csv", "r") as fp3:
                                                next(fp3)  # Skip header of csv
                                                for line3 in fp3:
                                                    ring_member_freq_csv_values = line3.split(",")
                                                    # Check if this transaction's output public key matches the current line
                                                    if wallet_tx_data[tx_hash]['Output_Pub_Key'] == ring_member_freq_csv_values[0].strip():
                                                        # Add the number of times it has been seen on chain
                                                        Ring_Member['Ring_Member_Freq'] = int(ring_member_freq_csv_values[1].strip())
                                            wallet_tx_data[tx_hash]['Outputs']['Decoys_On_Chain'].append(Ring_Member)
                                    # Only collect 10 decoys found on chain because it gets too resource intensive when
                                    # calculating all the temporal features for every decoy's ring signatures
                                    if len(wallet_tx_data[tx_hash]['Outputs']['Decoys_On_Chain']) >= 10:
                                        break

                # CSV HEADERS -> "Timestamp, Block_no, Tx_hash, Output_pub_key, Key_image, Ring_no/Ring_size"
                #                     0          1        2           3             4              5
                with open(Wallet_dir + "/xmr_report_outgoing_txs_" + Wallet_addr + ".csv", "r") as fp:
                    next(fp)  # Skip header of csv
                    for line in fp:
                        xmr2csv_outgoing_csv_values = line.split(",")
                        # Make sure the hash exists in the dataset
                        if xmr2csv_outgoing_csv_values[2].strip() in wallet_tx_data.keys():
                            # Check if there is a dictionary to keep track of input true spends (labels)
                            if 'Input_True_Rings' not in wallet_tx_data[xmr2csv_outgoing_csv_values[2].strip()].keys():
                                wallet_tx_data[xmr2csv_outgoing_csv_values[2].strip()]['Input_True_Rings'] = {}
                            # Set the key image as the dictionary key and 'Ring_no/Ring_size' as the value
                            wallet_tx_data[xmr2csv_outgoing_csv_values[2].strip()]['Input_True_Rings'][xmr2csv_outgoing_csv_values[4].strip()] = xmr2csv_outgoing_csv_values[5].strip()
            else:
                print(yellow + "Warning: " + reset + str(Wallet_dir) + " did not contain any transactions!")
    return wallet_tx_data
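

# A hypothetical standalone call ( the address and path below are illustrative ):
#   combine_files(["<wallet addr>", "./wallets/run_0"])
# returns {tx_hash: {...}} for every outgoing transaction found in that wallet's exports.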


def discover_wallet_directories(dir_to_search):
    """
    Walk the given directory tree, find every folder containing exported wallet
    CSV files, combine each wallet's files, and load the labeled transactions
    into the global `data` dictionary.
    :param dir_to_search: Path to the directory containing the wallet exports
    :return: None ( the global `data` dict is populated as a side effect )
    """
    # ERROR checking: make sure the directory exists and is not empty
    try:
        if len(listdir(dir_to_search)) == 0:
            print(red + "Error: {} is an empty directory!".format(dir_to_search) + reset)
            exit(1)
    except FileNotFoundError:
        print(red + "Error: {} is a non-existent directory!".format(dir_to_search) + reset)
        exit(1)

    # Traverse the root directory, listing directories as dirs and files as files
    unique_directories = []
    for root, dirs, files in walk(dir_to_search):
        for name in files:
            # Find all csv files
            if name.lower().endswith(".csv"):
                # Find all the unique folders holding csv files
                if root not in unique_directories:
                    unique_directories.append(root)
    cwd = getcwd()  # Remember the starting directory

    Wallet_addrs = []
    Wallet_info = []
    # Go through each directory that has csv files in it
    for idx, dir in tqdm(enumerate(unique_directories), desc="Enumerating Wallet Folders", total=len(unique_directories), colour='blue'):
        chdir(dir)
        # Iterate over the files in the directory
        for root, dirs, files in walk("."):
            for name in files:  # Get the file name
                # Get each csv file
                if name.lower().endswith(".csv"):
                    # Extract the wallet address from the file name: reverse the name, strip
                    # the extension, take the token before the first underscore, and reverse
                    # it back ( e.g. "cli_export_<addr>.csv" -> "<addr>" )
                    addr = name[::-1].split(".")[1].split("_")[0][::-1]
                    if addr not in Wallet_addrs:
                        Wallet_info.append([addr, dir])
                        Wallet_addrs.append(addr)
                        # Don't keep looking if the two wallet addresses are already found
                        if len(Wallet_addrs) == 2:
                            break
        chdir(cwd)
    chdir(cwd)

    del Wallet_addrs  # Not needed anymore
    collect()  # Garbage Collector

    global data  # Import the global database
    total_txs = 0
    num_bad_txs = 0
    # Multiprocess combining the exported csv files for each wallet
    with Pool(processes=NUM_PROCESSES) as pool:
        for wallet_tx_data in tqdm(pool.imap_unordered(func=combine_files, iterable=Wallet_info), desc="(Multiprocessing) Combining Exported Wallet Files", total=len(Wallet_info), colour='blue'):
            # Only add a transaction to the dataset if it carries its true-spend labels
            for tx_hash, tx_data in wallet_tx_data.items():
                if "Input_True_Rings" in tx_data.keys():
                    data[tx_hash] = tx_data
                    total_txs += 1
                else:
                    num_bad_txs += 1
    print("There were " + str(num_bad_txs) + " bad transactions that were deleted out of a total " + str(total_txs) + " transactions!")
    print("The dataset now includes " + str(len(data)) + " transactions.")


def clean_transaction(transaction):
    """
    A transaction from the original dataset contains information that is not
    necessarily useful for training a machine learning model. This
    information includes cryptographically random strings ( wallet
    addresses and private keys ) as well as human-readable strings.
    This function also strips any "deanonymizing" features and
    returns them in a separate dictionary to be added to the labels.
    :param transaction: A dictionary of transaction information
    :return: A dictionary of labels associated with the input transaction
    """
    private_info = {}
    private_info['True_Ring_Pos'] = {}
    del transaction['Tx_Version']
    del transaction['Block_Number']
    del transaction['Block_Timestamp_Epoch']
    del transaction['Num_Confirmations']
    del transaction['Direction']
    del transaction['Block_Timestamp']
    private_info['Tx_Amount'] = transaction['Amount']
    del transaction['Amount']
    private_info['Wallet_Balance'] = transaction['Wallet_Balance']
    del transaction['Wallet_Balance']
    del transaction['Destination_Address']
    del transaction['Sender_Address']
    del transaction['Network']
    del transaction['Outputs']
    # del transaction['Outputs']['Output_Data']
    # del transaction['Outputs']['Decoys_On_Chain']  # TODO NEED TO EXPAND UPON THIS
    for idx, each_input in enumerate(transaction['Inputs']):
        del each_input['Key_Image']
        del each_input['Ring_Members']
        private_info['True_Ring_Pos'][idx] = each_input['Ring_no/Ring_size']
        del each_input['Ring_no/Ring_size']
    del transaction['xmr2csv_Data_Collection_Time']
    del transaction['Tx_Public_Key']
    del transaction['Output_Pub_Key']
    del transaction['Output_Key_Img']
    private_info['Out_idx'] = transaction['Out_idx']
    del transaction['Out_idx']
    private_info['Wallet_Output_Number_Spent'] = transaction['Wallet_Output_Number_Spent']
    del transaction['Wallet_Output_Number_Spent']
    del transaction['Payment_ID']
    del transaction['Payment_ID8']
    del transaction['Time_Of_Enrichment']
    del transaction['Tx_Extra']  # TODO NEED TO USE THIS LATER ON
    del transaction['Num_Output_Decoys']  # TODO
    del transaction['Block_To_xmr2csv_Time_Delta']
    return private_info
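

# The labels dict returned above has this shape ( values are illustrative ):
#   {'True_Ring_Pos': {0: '7/11'}, 'Tx_Amount': 1.5, 'Wallet_Balance': 10.0,
#    'Out_idx': 0, 'Wallet_Output_Number_Spent': 3}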


def create_feature_set(database):
    """
    This function takes in a nested python dictionary dataset, removes
    any entries that would not be a useful feature for a machine learning
    model, flattens the dictionary, and converts it to a dataframe. An
    accompanying labels list is also returned.
    :param database: Nested dictionary of Monero transaction metadata
    :return: A pandas dataframe of the input data and a list of labels
    """
    labels = []
    Valid_Transactions = []
    num_errors = 0
    feature_set = dict()
    num_of_valid_txs = 0  # Incrementer which doesn't count invalid txs
    # Iterate through each tx hash in the database dict
    for idx, tx_hash in tqdm(enumerate(database.keys()), total=len(database), colour='blue', desc="Cleaning Transactions"):
        # Pass the transaction ( by reference ) to be stripped of non-features and receive the labels back
        try:
            private_info = clean_transaction(database[tx_hash])
        except Exception:
            num_errors += 1
            continue  # Don't process the tx; move on to the next one
        # Add tx hash to the good list
        # Valid_Transactions.append(DataFrame(CherryPicker(database[tx_hash]).flatten(delim='.').get(), index=[idx]))
        # Flatten each transaction and iterate over each feature
        for k, v in CherryPicker(database[tx_hash]).flatten(delim='.').get().items():
            # Check if the feature name is not already in the feature set
            if k not in feature_set.keys():
                feature_set[k] = []
                # Backfill -1 for every earlier valid transaction so the rows stay aligned
                for i in range(num_of_valid_txs):
                    feature_set[k].append(-1)
                # Add it as a new feature
                feature_set[k].append(v)
            else:  # If the feature is already in the feature set
                # Check if there are any earlier transactions that did not have this feature
                if len(feature_set[k]) < num_of_valid_txs:
                    # Add -1 for those occurrences
                    for i in range(num_of_valid_txs - len(feature_set[k])):
                        feature_set[k].append(-1)
                # Append the feature
                feature_set[k].append(v)
        num_of_valid_txs += 1
        # Add the labels to the list
        labels.append(private_info)

    print("Number of skipped transactions:", num_errors)
    assert len(labels) != 0
    del database
    collect()  # Garbage Collector
    feature_set_df = DataFrame.from_dict(feature_set, orient='index').transpose()
    # Replace any null values with -1
    feature_set_df.fillna(-1, inplace=True)

    del feature_set
    collect()  # Garbage collector

    # Sanity check
    assert len(labels) == len(feature_set_df)

    # Combine dataframes together
    # https://www.confessionsofadataguy.com/solving-the-memory-hungry-pandas-concat-problem/
    # feature_set = concat(Valid_Transactions, axis=0).fillna(-1)

    # Shuffle the data
    feature_set_df, labels = shuffle(feature_set_df, labels)
    feature_set_df, labels = shuffle(feature_set_df, labels)
    feature_set_df, labels = shuffle(feature_set_df, labels)

    # Reset the indexing after the shuffles
    feature_set_df.reset_index(drop=True, inplace=True)
    return feature_set_df, labels
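

# CherryPicker flattening, as used above, turns the nested transaction dict into
# dotted feature names. A minimal illustration of the shape this pipeline relies
# on ( the data is made up ):
#   CherryPicker({'Inputs': [{'Amount': 2}]}).flatten(delim='.').get()
#   -> {'Inputs.0.Amount': 2}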


def undersample_processing(y, ns, min_occurrences, occurrences):
    """
    Worker that undersamples a single transaction's ring signatures.
    :param y: Tuple of (row index into ns.X, labels dict for that row)
    :param ns: Manager namespace holding the shared feature dataframe as ns.X
    :param min_occurrences: Target number of samples to keep per class
    :param occurrences: Shared dict counting the samples kept per ring position
    :return: Tuple of (list of single-row dataframes, list of their labels)
    """
    undersampled_y = []
    new_X = []
    y_idx, ring_array = y
    # For each array of ring members iterate over each index
    for ring_array_idx in range(len(ring_array["True_Ring_Pos"])):
        # Get the true ring position (label) for the current iteration
        ring_pos = int(ring_array["True_Ring_Pos"][ring_array_idx].split("/")[0])
        total_rings = int(ring_array["True_Ring_Pos"][ring_array_idx].split("/")[1])
        # Check that we haven't hit the maximum number of labels for this position
        # and that the number of ring members is what we expect
        if occurrences[ring_pos] < min_occurrences and total_rings == NUM_RING_MEMBERS:
            occurrences[ring_pos] = occurrences[ring_pos] + 1
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
            # Slice out the row from the dataframe but keep it as a dataframe
            temp_df = ns.X.iloc[[y_idx]]
            # Go through each column name in the temp dataframe
            for col_name in temp_df.columns:
                # Check if the column name has data relating to irrelevant ring signatures
                if "Inputs." in col_name and "." + str(ring_array_idx) + "." not in col_name:
                    # Delete the column
                    temp_df = temp_df.drop([col_name], axis=1)
                # Check if the column name is for the current ring signature
                elif "Inputs." in col_name and "." + str(ring_array_idx) + "." in col_name:
                    # Rename the column so it doesn't carry the .0. / .1. positioning information
                    temp_df.rename(columns={col_name: col_name.replace("Inputs." + str(ring_array_idx) + ".", "Input.")}, inplace=True)
            # Add to the new X and y lists
            new_X.append(temp_df)
            undersampled_y.append(ring_pos)
    return new_X, undersampled_y
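

# The column filtering above reshapes one multi-input row into one row per ring
# signature. For example ( illustrative column name ), when ring_array_idx == 0,
# "Inputs.0.Previous_Tx_Num_Outputs.3" becomes "Input.Previous_Tx_Num_Outputs.3"
# and every "Inputs.1.*" column is dropped.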


def undersample_processing_wrapper(y_X_min_occurrences_Occurrences):
    """
    Unpack a zipped argument tuple and forward it to undersample_processing()
    ( Pool.imap_unordered only passes a single argument to each worker ).
    :param y_X_min_occurrences_Occurrences: Tuple of (y, ns, min_occurrences, occurrences)
    :return: The result of undersample_processing()
    """
    return undersample_processing(*y_X_min_occurrences_Occurrences)


def undersample(X, y):
    """
    Balance the dataset so every true ring position (1..NUM_RING_MEMBERS) is
    represented by the same number of samples, one ring signature per row.
    :param X: Pandas dataframe of flattened transaction features
    :param y: List of label dictionaries produced by create_feature_set()
    :return: Tuple of (undersampled dataframe, list of true ring positions)
    """
    # Flatten the ring signature labels into a list
    flattened_true_spend = []
    for ring_array in y:
        for idx, true_ring_pos in ring_array["True_Ring_Pos"].items():
            flattened_true_spend.append(int(true_ring_pos.split("/")[0]))
    # Reset pandas indexing just in case
    X.reset_index(drop=True, inplace=True)
    # Count the number of true labels at each position in the ring signature
    labels_distribution = Counter(flattened_true_spend)

    # Error checking
    try:
        # Make sure that there are no classes with 0 labels
        assert len(labels_distribution) == NUM_RING_MEMBERS
    except AssertionError:
        print(red + "Error: The dataset contains at least one class which has 0 labels!" + reset)
        exit(1)

    # Find the smallest number of occurrences across the classes
    min_occurrences = labels_distribution.most_common()[-1][1]
    print("Undersampling to " + str(min_occurrences) + " transactions per class. A total of " + str(min_occurrences * NUM_RING_MEMBERS) + " transactions.")
    # max_occurrences = labels_distribution.most_common(1)[0][1]

    undersampled_y = []
    new_X = []
    with Manager() as manager:
        # https://stackoverflow.com/questions/19887087/how-to-share-pandas-dataframe-object-between-processes
        ns = manager.Namespace()
        ns.X = X
        # Create a dictionary for all 11 spots in a ring signature
        occurrences = manager.dict()
        for i in range(NUM_RING_MEMBERS):
            occurrences[i + 1] = 0
        # Multiprocessing the undersampling of each transaction
        with manager.Pool(processes=NUM_PROCESSES) as pool:
            for result in tqdm(pool.imap_unordered(func=undersample_processing_wrapper,
                                                   iterable=zip(
                                                       list(enumerate(y)),
                                                       repeat(ns, len(y)),
                                                       repeat(min_occurrences, len(y)),
                                                       repeat(occurrences, len(y))
                                                   )
                                                   ),
                               desc="(Multiprocessing) Undersampling Dataset",
                               total=len(y),
                               colour='blue'
                               ):
                subset_new_X = result[0]
                subset_undersampled_y = result[1]
                # Add to the new X and y lists
                new_X = new_X + subset_new_X
                undersampled_y = undersampled_y + subset_undersampled_y

    del X  # Remove the old dataset to save RAM
    collect()  # Garbage collector

    # Combine the list of dataframes together into a single DF
    undersampled_X = concat(new_X, axis=0)
    del new_X
    collect()  # Garbage collector

    # Sometimes there is a race condition where a class will get one extra sample ( most of the time this happens while debugging )
    assert len(undersampled_X) == len(undersampled_y) == (min_occurrences * NUM_RING_MEMBERS)

    # Shuffle the data one last time
    undersampled_X, undersampled_y = shuffle(undersampled_X, undersampled_y)
    undersampled_X, undersampled_y = shuffle(undersampled_X, undersampled_y)
    undersampled_X, undersampled_y = shuffle(undersampled_X, undersampled_y)
    undersampled_X.reset_index(drop=True, inplace=True)
    return undersampled_X, undersampled_y


def main():
    # Error checking for command line args
    if len(argv) != 2:
        print("Usage Error: ./create_dataset.py < Wallets Directory Path >")
        exit(1)
    try:  # Check that the block explorer at API_URL accepts connections
        assert get(API_URL + "/block/1").status_code == 200
    except ConnectionError:
        print("Error: " + red + NETWORK + reset + " block explorer located at " + API_URL + " refused connection!")
        exit(1)
    # Configuration alert
    print("The dataset is being collected for the " + blue + NETWORK + reset + " network using " + API_URL + " as a block explorer!")

    ###########################################
    # Create the dataset from files on disk #
    ###########################################
    global data
    print(blue + "Opening " + str(argv[1]) + reset + "\n")
    # Find where the wallets are stored and combine the exported csv files
    discover_wallet_directories(argv[1])

    # Multiprocessing References
    # https://leimao.github.io/blog/Python-tqdm-Multiprocessing/
    # https://thebinarynotes.com/python-multiprocessing/
    # https://docs.python.org/3/library/multiprocessing.html
    # https://stackoverflow.com/questions/6832554/multiprocessing-how-do-i-share-a-dict-among-multiple-processes
    with Manager() as manager:
        # Multiprocessing enriching each transaction
        with manager.Pool(processes=NUM_PROCESSES) as pool:
            for result in tqdm(pool.imap_unordered(func=enrich_data, iterable=list(data.items())), desc="(Multiprocessing) Enriching Transaction Data", total=len(data), colour='blue'):
                tx_hash, transaction_entry = result[0], result[1]  # Unpack the values returned
                data[tx_hash] = transaction_entry  # Workers operate on copies, so write the enriched tx back

    # Save the raw database to disk
    makedirs("./Dataset_Files", exist_ok=True)  # Make sure the output directory exists
    with open("./Dataset_Files/dataset.pkl", "wb") as fp:
        pickle.dump(data, fp)
    print("./Dataset_Files/dataset.pkl written to disk!")

    #################################
    # Remove Unnecessary Features #
    #################################
    with open("./Dataset_Files/dataset.pkl", "rb") as fp:
        data = pickle.load(fp)
    # Feature selection on the raw dataset
    X, y = create_feature_set(data)
    del data
    collect()  # Garbage collector

    # Save data and labels to disk for future AI training
    with open("./Dataset_Files/X.pkl", "wb") as fp:
        pickle.dump(X, fp)
    with open("./Dataset_Files/y.pkl", "wb") as fp:
        pickle.dump(y, fp)
    # Error checking; labels and data should be the same length
    assert len(X) == len(y)
    print("./Dataset_Files/X.pkl and ./Dataset_Files/y.pkl written to disk!")

    ###################
    # Undersampling #
    ###################

    with open("./Dataset_Files/X.pkl", "rb") as fp:
        X = pickle.load(fp)
    with open("./Dataset_Files/y.pkl", "rb") as fp:
        y = pickle.load(fp)

    X_Undersampled, y_Undersampled = undersample(X, y)
    del X
    collect()  # Garbage collector

    with open("./Dataset_Files/X_Undersampled.pkl", "wb") as fp:
        pickle.dump(X_Undersampled, fp)
    with open("./Dataset_Files/y_Undersampled.pkl", "wb") as fp:
        pickle.dump(y_Undersampled, fp)

    print("./Dataset_Files/X_Undersampled.pkl and ./Dataset_Files/y_Undersampled.pkl written to disk!\nFinished")


if __name__ == '__main__':
    try:
        main()
    # Gracefully exit if the user hits CTRL + C
    except KeyboardInterrupt:
        print("Error: User stopped the script's execution!")
        exit(1)
    except Exception as e:
        import traceback
        print(e)
        traceback.print_exc()  # Print the stack trace ( the function returns None, so don't print() it )
        exit(1)