mirror of
https://github.com/MAGICGrants/Monero-Dataset-Pipeline.git
synced 2026-01-09 13:37:57 -05:00
- Filled in comments
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
# Usage: This script expects to be placed into a folder that contains wallets. Each wallet should
|
||||
# have a Test-Wallet1, Test-Wallet1.keys, and Test-Wallet1.address.txt file.
|
||||
# If you use a remote node for the export portion of the script, just note that
|
||||
# you will need a local copy of the blockchain for the xmr2csv portion of the script
|
||||
# you will need a local copy of the blockchain for the xmr2csv portion of the script.
|
||||
|
||||
|
||||
# Global variables
|
||||
|
||||
@@ -21,9 +21,10 @@ from os import walk, getcwd, chdir, listdir, fsync, system, remove
|
||||
options.mode.chained_assignment = None # default='warn'
|
||||
|
||||
'''
|
||||
Description:
|
||||
Description: The main processing file that will find all the exported csv files and
|
||||
convert them into a database suitable for machine learning.
|
||||
Usage: ./create_dataset.py < Wallets Directory Path >
|
||||
Date: 7/8/2022
|
||||
Date: 8/17/2022
|
||||
Author: ACK-J
|
||||
|
||||
Warning: DO NOT run this processing with a remote node, there are a lot of blockchain lookups and it will be slow!
|
||||
@@ -32,6 +33,8 @@ To run your own node and block explorer:
|
||||
monerod --stagenet https://github.com/monero-project/monero
|
||||
xmrblocks --stagenet --enable-json-api https://github.com/moneroexamples/onion-monero-blockchain-explorer
|
||||
|
||||
For the feature engineering to complete successfully, you will need to set up https://github.com/neptuneresearch/ring-membership-sql
|
||||
for the desired network of choice (stagenet, testnet, mainnet)
|
||||
'''
|
||||
|
||||
######################
|
||||
@@ -45,11 +48,11 @@ API_URL = "http://127.0.0.1:8081/api" # Local Monero Block Explorer
|
||||
NUM_RING_MEMBERS = 11 # DL models depend on a discrete number of rings
|
||||
PREDICTING = False # If True, no undersampling will happen
|
||||
|
||||
POSTGRES_SQL_HOST = "127.0.0.1"
|
||||
POSTGRES_SQL_PORT = "18333"
|
||||
POSTGRES_SQL_USERNAME = "xmrack"
|
||||
POSTGRES_SQL_PASSWORD = "xmrack"
|
||||
POSTGRES_SQL_DB_NAME = "xmrstagedb"
|
||||
POSTGRES_SQL_HOST = "127.0.0.1" # The IP address of the PostgreSQL ring membership server
|
||||
POSTGRES_SQL_PORT = "18333" # The port of the PostgreSQL server
|
||||
POSTGRES_SQL_USERNAME = "xmrack" # The username of the PostgreSQL server
|
||||
POSTGRES_SQL_PASSWORD = "xmrack" # The password of the PostgreSQL server
|
||||
POSTGRES_SQL_DB_NAME = "xmrstagedb" # The name of the database
|
||||
|
||||
###################################################################################
|
||||
# You shouldn't need to edit anything below this line unless things break #
|
||||
@@ -73,7 +76,7 @@ def get_xmr_tx(tx_hash):
|
||||
|
||||
def enrich_data(tx_dict_item):
|
||||
"""
|
||||
|
||||
The heavy lifting of the script that adds new features by querying the blockchain and the Postgres ring signature DB.
|
||||
:param tx_dict_item:
|
||||
:return:
|
||||
"""
|
||||
@@ -261,9 +264,9 @@ ORDER BY input_pos, input_mem_idx, height_B ASC
|
||||
|
||||
def combine_files(Wallet_info):
|
||||
"""
|
||||
|
||||
:param Wallet_info:
|
||||
:return:
|
||||
Combine the multiple csv files and extract the unique values from each
|
||||
:param Wallet_info: A tuple of (Wallet_addr, Wallet_dir)
|
||||
:return: A dictionary of transaction metadata
|
||||
"""
|
||||
Wallet_addr = Wallet_info[0]
|
||||
Wallet_dir = Wallet_info[1]
|
||||
@@ -371,9 +374,8 @@ def combine_files(Wallet_info):
|
||||
|
||||
def discover_wallet_directories(dir_to_search):
|
||||
"""
|
||||
|
||||
:param dir_to_search:
|
||||
:return:
|
||||
Discovers the csv files exported by collect.sh
|
||||
:param dir_to_search: The directory to start the recursive search
|
||||
"""
|
||||
# ERROR Checking if the directory is empty or not
|
||||
try:
|
||||
@@ -450,10 +452,10 @@ def clean_transaction(transaction):
|
||||
necessarily useful for training a machine learning model. This
|
||||
information includes cryptographically random strings ( wallet
|
||||
addresses, and private keys ) as well as human-readable strings.
|
||||
This function will also strip any "deanonymized" features and
|
||||
This function will strip any "secret" features and
|
||||
return them in a separate dictionary to be added to the labels.
|
||||
:param transaction: A dictionary of transaction information
|
||||
:return: A dictionary of labels associated to the inputted transaction
|
||||
:param transaction: A dictionary of transaction information, passed by reference
|
||||
:return: private information within the transaction
|
||||
"""
|
||||
private_info = {}
|
||||
del transaction['Tx_Version']
|
||||
@@ -495,7 +497,8 @@ def clean_transaction(transaction):
|
||||
|
||||
def create_feature_set(database):
|
||||
"""
|
||||
|
||||
Converts the python dictionary into a Pandas dataframe and removes
|
||||
any columns that would not be useful to a machine learning classifier
|
||||
:param database: Nested dictionary of Monero transaction metadata
|
||||
:return: A pandas dataframe of the input data and a list of labels
|
||||
"""
|
||||
@@ -553,20 +556,20 @@ def create_feature_set(database):
|
||||
|
||||
# Shuffle the data
|
||||
feature_set_df, labels = shuffle(feature_set_df, labels, random_state=69)
|
||||
# Reset the indexing after the shuffles
|
||||
# Reset the indexing after the shuffle
|
||||
feature_set_df.reset_index(drop=True, inplace=True)
|
||||
return feature_set_df, labels
|
||||
|
||||
|
||||
def undersample_processing(y, series, min_occurrences, occurrences, predicting):
|
||||
"""
|
||||
|
||||
:param y:
|
||||
:param series:
|
||||
:param min_occurrences:
|
||||
:param occurrences:
|
||||
:param predicting:
|
||||
:return:
|
||||
The processing stage of the undersampling process.
|
||||
:param y: The labels
|
||||
:param series: A single row in the dataset
|
||||
:param min_occurrences: The number of samples per class
|
||||
:param occurrences: A dictionary that tracks the number of occurrences per class in the undersampled dataset
|
||||
:param predicting: Boolean value, if true dont undersample
|
||||
:return: new_X, new_y
|
||||
"""
|
||||
new_y = []
|
||||
new_X = []
|
||||
@@ -597,11 +600,12 @@ def undersample_processing(y, series, min_occurrences, occurrences, predicting):
|
||||
|
||||
def undersample(X, y, predicting):
|
||||
"""
|
||||
|
||||
:param X:
|
||||
:param y:
|
||||
:param predicting:
|
||||
:return:
|
||||
Undersample the dataset such that every class has the same number of occurrences
|
||||
:param X: The data
|
||||
:param y: The labels
|
||||
:param predicting: Boolean value if predicting is true then do not undersample.
|
||||
For example, if you are predicting on mainnet samples.
|
||||
:return: undersampled_X, undersampled_y
|
||||
"""
|
||||
# Flatten the ring signature labels into a list
|
||||
flattened_true_spend = []
|
||||
@@ -736,9 +740,9 @@ def undersample(X, y, predicting):
|
||||
|
||||
def write_dict_to_csv(data_dict):
|
||||
"""
|
||||
|
||||
:param data_dict:
|
||||
:return:
|
||||
A custom function to convert the nested dictionary into a flattened CSV
|
||||
since there was no python module available.
|
||||
:param data_dict: The nested python dict
|
||||
"""
|
||||
# Keep track of all column names for the CSV
|
||||
column_names = []
|
||||
@@ -795,11 +799,11 @@ def write_dict_to_csv(data_dict):
|
||||
|
||||
def validate_data_integrity(X, y, undersampled=False):
|
||||
"""
|
||||
|
||||
:param X:
|
||||
:param y:
|
||||
:param undersampled:
|
||||
:return:
|
||||
Validates the during the processing of the data, no samples have accidentally
|
||||
changed positions from their associated record. This largely serves as a santity check.
|
||||
:param X: The modified data that needs to be checked
|
||||
:param y: The modified labels that needs to be checked
|
||||
:param undersampled: A boolean switch if the sanity check is before or after undersampling
|
||||
"""
|
||||
print(blue + "\nData Integrity Check" + reset)
|
||||
if undersampled:
|
||||
@@ -907,9 +911,8 @@ def validate_data_integrity(X, y, undersampled=False):
|
||||
|
||||
def delete_file(list_of_paths):
|
||||
"""
|
||||
|
||||
:param list_of_paths:
|
||||
:return:
|
||||
A simple function to delete a given file
|
||||
:param list_of_paths: Path to the file
|
||||
"""
|
||||
for path in list_of_paths:
|
||||
if exists(path):
|
||||
@@ -1043,7 +1046,7 @@ if __name__ == '__main__':
|
||||
except KeyboardInterrupt as e:
|
||||
print("Error: User stopped the script's execution!")
|
||||
exit(1)
|
||||
# All other raised errors, print the stack trace
|
||||
# Any other raised errors, print the stack trace
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(e)
|
||||
|
||||
32
run.sh
32
run.sh
@@ -1,20 +1,26 @@
|
||||
#!/bin/bash
|
||||
# This script will create pairs of wallets which will transact between eachother. Due to
|
||||
# monero's 20 minute lockout period, creating a large amount of simulated transactions is
|
||||
# difficult. This script automates the wallet creation, funding and will transact between
|
||||
# wallets infinitely. The only manual setup is to have a wallet with a large amount of testnet
|
||||
# coins within the root directory, and it must be named "FundingWallet".
|
||||
# This script will create pairs of wallets which will transact between eachother.
|
||||
# This script automates the wallet creation, funding and will transact between them
|
||||
# in the background until the specified stop time. The only manual setup is to have a wallet with a
|
||||
# large amount of coins within ./Funding_Wallets/, and it must be named ${NETWORK^}-Funding.
|
||||
|
||||
# Usage: chmod +x ./run.sh && ./run.sh
|
||||
# Dependencies: tmux, expect, monero-wallet-cli, curl, jq (I think that's everything...)
|
||||
|
||||
|
||||
|
||||
# Global variables of anything that would need to be changed in this file
|
||||
NETWORK="stagenet" # Case-sensitive (make all lowercase)
|
||||
NETWORK="stagenet" # Case-sensitive, Make sure it is all lowercase (testnet, stagenet, mainnet)
|
||||
if [[ "$NETWORK" == "stagenet" ]];then PORT="38081"; else PORT="28081"; fi
|
||||
REMOTE_NODE="community.rino.io"
|
||||
FUNDING_DELAY="1"
|
||||
FUNDING_AMOUNT=".01"
|
||||
TERMINAL_TAB_DELAY="10"
|
||||
END_COLLECTION_EPOCH_DATE="1656637261" # Must be in epoch time (July 1st)
|
||||
REMOTE_NODE="community.rino.io" # Remote node to send transactions to the network
|
||||
FUNDING_DELAY="1" # Time inbetween funding wallets ( If this value is too low your funding wallet could have issues with the 20 minute lock)
|
||||
FUNDING_AMOUNT=".01" # The amount to send to each wallet created
|
||||
TMUX_WINDOW_DELAY="10" # The delay inbetween launching a new wallet
|
||||
END_COLLECTION_EPOCH_DATE="1656637261" # Must be in epoch time (July 1st) The time when collection should stop
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#############################################################################
|
||||
# You shouldn't need to edit anything below this line #
|
||||
@@ -241,8 +247,8 @@ EOL
|
||||
echo -e '\033[34mSpawned new tmux window: \033[0m' "${walletAddr}"
|
||||
tmux new-window -t run-sh: "python3 ../../spawn.py ${walletName}"
|
||||
# A delay of opening a new tab to not overload the server. Most wallets will have to scan the network for a while before transacting
|
||||
echo -e '\033[34mSleeping for: \033[0m\t\t ' $TERMINAL_TAB_DELAY ' seconds'
|
||||
sleep $TERMINAL_TAB_DELAY
|
||||
echo -e '\033[34mSleeping for: \033[0m\t\t ' $TMUX_WINDOW_DELAY ' seconds'
|
||||
sleep $TMUX_WINDOW_DELAY
|
||||
done < <(find ./ -type f -name "*.txt" | sort -u)
|
||||
cd - || exit # Reset the directory
|
||||
done < <(find ./Wallets -mindepth 1 -type d | sort -u)
|
||||
|
||||
14
spawn.py
14
spawn.py
@@ -1,11 +1,15 @@
|
||||
from sys import argv
|
||||
from math import exp
|
||||
from numpy import random
|
||||
from time import sleep, time
|
||||
from math import exp
|
||||
from datetime import datetime, timedelta
|
||||
from subprocess import Popen, PIPE, TimeoutExpired
|
||||
from sys import argv
|
||||
from colorama import Fore, Style
|
||||
from os import getenv, getcwd, system
|
||||
from datetime import datetime, timedelta
|
||||
from subprocess import Popen, PIPE, TimeoutExpired
|
||||
|
||||
"""
|
||||
Description: The script that each wallet runs to transact in the tmux windows.
|
||||
"""
|
||||
|
||||
NETWORK = getenv('RUN_SH_NETWORK')
|
||||
END_COLLECTION_EPOCH_DATE = getenv('END_COLLECTION_EPOCH_DATE')
|
||||
@@ -13,7 +17,7 @@ END_COLLECTION_EPOCH_DATE = getenv('END_COLLECTION_EPOCH_DATE')
|
||||
|
||||
def runcommand(cmd):
|
||||
"""
|
||||
|
||||
Function to make syscalls
|
||||
:param cmd:
|
||||
:return:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user