import json
import os
import time
from typing import Literal, Mapping, Sequence, Type, Union

import ezkl
import numpy as np
import torch

from zkstats.computation import IModel


# ===================================================================================================
# ===================================================================================================

def verifier_define_calculation(
    dummy_data_path: str,
    selected_columns: list[str],
    # TODO: dummy_sel_data_path is redundant here, but it is needed by _process_data
    dummy_sel_data_path: str,
    verifier_model: Type[IModel],
    verifier_model_path: str,
) -> None:
    """
    Export the verifier model to an ONNX file.

    :param dummy_data_path: path to the dummy data file
    :param selected_columns: column names selected for computation
    :param dummy_sel_data_path: path to store the generated preprocessed dummy data file
    :param verifier_model: the verifier model class
    :param verifier_model_path: path to store the generated verifier model file in ONNX format
    """
    dummy_data_tensor_array = _process_data(dummy_data_path, selected_columns, dummy_sel_data_path)
    # export the ONNX file
    _export_onnx(verifier_model, dummy_data_tensor_array, verifier_model_path)

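
# A minimal usage sketch for the verifier side (file names and the model class
# `MyModel` are hypothetical; the model class comes from zkstats.computation):
#
#     verifier_define_calculation(
#         dummy_data_path="dummy_data.json",
#         selected_columns=["x"],
#         dummy_sel_data_path="dummy_sel_data.json",
#         verifier_model=MyModel,
#         verifier_model_path="verifier_model.onnx",
#     )
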
# TODO: Should only need the shape of the data instead of the real dataset, since
# users (verifiers) call this function and they don't have the real data.
def create_dummy(data_path: str, dummy_data_path: str) -> None:
    """
    Create a dummy data file with randomized data based on the shape of the original data.
    """
    with open(data_path, "r") as f:
        data = json.load(f)
    # assume all columns have the same number of rows
    dummy_data = {}
    for col in data:
        # Don't use the same value for every column, to avoid degenerate cases
        # such as a singular matrix.
        dummy_data[col] = np.round(np.random.uniform(1, 30, len(data[col])), 1).tolist()

    with open(dummy_data_path, 'w') as f:
        json.dump(dummy_data, f)

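
# For illustration, given a hypothetical data file data.json of the form
# {"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]},
#
#     create_dummy("data.json", "dummy_data.json")
#
# writes dummy_data.json with the same columns and row counts but random values.
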

# ===================================================================================================
# ===================================================================================================

def prover_gen_settings(
    data_path: str,
    selected_columns: list[str],
    sel_data_path: str,
    prover_model: Type[IModel],
    prover_model_path: str,
    scale: Union[list[int], Literal["default"]],
    # TODO: should we hardcode mode to "resources" or make it the default?
    mode: Literal["resources", "accuracy"],
    settings_path: str,
) -> None:
    """
    Generate and calibrate settings for the given model and data.

    :param data_path: path to the data file
    :param selected_columns: column names selected for computation
    :param sel_data_path: path to store the generated preprocessed data file
    :param prover_model: the prover model class
    :param prover_model_path: path to store the generated prover model file in ONNX format
    :param scale: the scale to use for the computation: a list of integers, or "default" for the default scale
    :param mode: the calibration mode, either "resources" or "accuracy"
    :param settings_path: path to store the generated settings file
    """
    data_tensor_array = _process_data(data_path, selected_columns, sel_data_path)

    # export the ONNX file
    _export_onnx(prover_model, data_tensor_array, prover_model_path)
    # generate and calibrate the settings
    _gen_settings(sel_data_path, prover_model_path, scale, mode, settings_path)

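
# A minimal usage sketch for the prover side (file names and the model class
# `MyModel` are hypothetical):
#
#     prover_gen_settings(
#         data_path="data.json",
#         selected_columns=["x"],
#         sel_data_path="sel_data.json",
#         prover_model=MyModel,
#         prover_model_path="prover_model.onnx",
#         scale="default",
#         mode="resources",
#         settings_path="settings.json",
#     )
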

# ===================================================================================================
# ===================================================================================================

def setup(
    model_path: str,
    compiled_model_path: str,
    settings_path: str,
    vk_path: str,
    pk_path: str,
) -> None:
    """
    Compile the model and generate the verification key and the proving key.

    :param model_path: path to the model file in ONNX format
    :param compiled_model_path: path to store the generated compiled model
    :param settings_path: path to the settings file
    :param vk_path: path to store the generated verification key file
    :param pk_path: path to store the generated proving key file
    """
    # compile the circuit
    res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
    assert res == True

    # fetch the structured reference string (SRS) required by the settings
    res = ezkl.get_srs(settings_path)

    # set up the vk and pk params; a prover can reuse this pk or generate their own
    print("==== setting up ezkl ====")
    start_time = time.time()
    res = ezkl.setup(
        compiled_model_path,
        vk_path,
        pk_path,
    )
    end_time = time.time()
    time_setup = end_time - start_time
    print(f"Time setup: {time_setup} seconds")

    assert res == True
    assert os.path.isfile(vk_path)
    assert os.path.isfile(pk_path)
    assert os.path.isfile(settings_path)

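
# For illustration (hypothetical file names, continuing the sketches above):
#
#     setup(
#         model_path="verifier_model.onnx",
#         compiled_model_path="model.compiled",
#         settings_path="settings.json",
#         vk_path="model.vk",
#         pk_path="model.pk",
#     )
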

# ===================================================================================================
# ===================================================================================================

def prover_gen_proof(
    prover_model_path: str,
    sel_data_path: str,
    witness_path: str,
    prover_compiled_model_path: str,
    settings_path: str,
    proof_path: str,
    pk_path: str,
) -> None:
    """
    Generate a proof for the given model and data.

    :param prover_model_path: path to the prover model file in ONNX format
    :param sel_data_path: path to the preprocessed data file
    :param witness_path: path to store the generated witness file
    :param prover_compiled_model_path: path to store the generated compiled prover model
    :param settings_path: path to the settings file
    :param proof_path: path to store the generated proof file
    :param pk_path: path to the proving key file
    """
    res = ezkl.compile_circuit(prover_model_path, prover_compiled_model_path, settings_path)
    assert res == True

    # generate the witness file
    print('==== Generating Witness ====')
    witness = ezkl.gen_witness(sel_data_path, prover_compiled_model_path, witness_path)
    assert os.path.isfile(witness_path)
    with open(settings_path) as f:
        settings = json.load(f)
    output_scales = settings['model_output_scales']
    print("witness boolean: ", ezkl.vecu64_to_float(witness['outputs'][0][0], output_scales[0]))
    for i, output in enumerate(witness['outputs'][1]):
        print("witness result", i + 1, ":", ezkl.vecu64_to_float(output, output_scales[1]))

    # generate the proof
    print("==== Generating Proof ====")
    start_time = time.time()
    res = ezkl.prove(
        witness_path,
        prover_compiled_model_path,
        pk_path,
        proof_path,
        "single",
    )
    print("proof: ", res)
    end_time = time.time()
    time_gen_prf = end_time - start_time
    print(f"Time gen prf: {time_gen_prf} seconds")
    assert os.path.isfile(proof_path)

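
# A minimal usage sketch (hypothetical file names, continuing the sketches above):
#
#     prover_gen_proof(
#         prover_model_path="prover_model.onnx",
#         sel_data_path="sel_data.json",
#         witness_path="witness.json",
#         prover_compiled_model_path="prover_model.compiled",
#         settings_path="settings.json",
#         proof_path="model.pf",
#         pk_path="model.pk",
#     )
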

# ===================================================================================================
# ===================================================================================================

# commitment_map is a Mapping[column_name, commitment_hex]
# E.g. {
#   "columns_0": "0x...",
#   ...
# }
TCommitmentMap = Mapping[str, str]

# commitment_maps is a Mapping[scale, Mapping[column_name, commitment_hex]]
# E.g. {
#   scale_0: {
#     "columns_0": "0x...",
#     ...
#   },
#   ...
# }
TCommitmentMaps = Mapping[str, TCommitmentMap]

def verifier_verify(
    proof_path: str,
    settings_path: str,
    vk_path: str,
    selected_columns: Sequence[str],
    commitment_maps: TCommitmentMaps,
) -> torch.Tensor:
    """
    Verify the proof and return the result.

    :param proof_path: path to the proof file
    :param settings_path: path to the settings file
    :param vk_path: path to the verification key file
    :param selected_columns: column names selected for computation, in the same
        order as the input commitments in the proof
    :param commitment_maps: expected data commitments, as a map from scale to a
        map from column name to commitment hex
    :return: the computation result decoded from the proof
    """
    # 1. First check the zk proof is valid
    res = ezkl.verify(
        proof_path,
        settings_path,
        vk_path,
    )
    # TODO: change asserts to return boolean
    assert res == True

    # 2. Check the input/output are correct
    with open(settings_path) as f:
        settings = json.load(f)
    input_scales = settings['model_input_scales']
    output_scales = settings['model_output_scales']
    with open(proof_path) as f:
        proof = json.load(f)
    proof_instance = proof["instances"][0]
    inputs = proof_instance[:len(input_scales)]
    outputs = proof_instance[len(input_scales):]
    len_inputs = len(inputs)
    len_outputs = len(outputs)
    # Sanity check: `instances` = input commitments + params (0 in our case) + outputs
    assert len(proof_instance) == len_inputs + len_outputs, f"lengths mismatch: {len(proof_instance)=}, {len_inputs=}, {len_outputs=}"

    # 2.1 Check the input commitments
    # All inputs are hashed, so they are commitments
    assert len_inputs == len(selected_columns), f"lengths mismatch: {len_inputs=}, {len(selected_columns)=}"
    # Check each commitment is correct
    for i, (actual_commitment, column_name) in enumerate(zip(inputs, selected_columns)):
        actual_commitment_str = ezkl.vecu64_to_felt(actual_commitment)
        input_scale = input_scales[i]
        expected_commitment = commitment_maps[str(input_scale)][column_name]
        assert actual_commitment_str == expected_commitment, f"commitment mismatch: {i=}, {actual_commitment_str=}, {expected_commitment=}"

    # 2.2 Check the output is correct. The output is a tuple (is_valid, result),
    # where is_valid must be 1.0, i.e. the result is within the allowed error.
    is_valid = ezkl.vecu64_to_float(outputs[0], output_scales[0])
    assert is_valid == 1.0, "result is not within the allowed error"
    return ezkl.vecu64_to_float(outputs[1], output_scales[1])

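
# A minimal usage sketch (hypothetical file names and scale; in practice the
# verifier receives the commitment maps from the data provider instead of
# computing them from the raw data):
#
#     result = verifier_verify(
#         proof_path="model.pf",
#         settings_path="settings.json",
#         vk_path="model.vk",
#         selected_columns=["x"],
#         commitment_maps=get_data_commitment_maps("data.json", [20]),
#     )
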

# ===================================================================================================
# ===================================================================================================

def get_data_commitment_maps(data_path: str, scales: Sequence[int]) -> TCommitmentMaps:
    """
    Generate a data commitment map for each scale. Commitments for different scales are
    required so that verifiers can verify proofs generated with different scales.

    :param data_path: path to the data file. The data file should be a JSON file with the
        following format:
        {
            "column_0": [number_0, number_1, ...],
            "column_1": [number_0, number_1, ...],
        }
    :param scales: a list of scales to use for the commitments
    :return: a map from scale to column name to commitment
    """
    with open(data_path) as f:
        data_json = json.load(f)
    return {
        str(scale): {
            k: _get_commitment_for_column(v, scale) for k, v in data_json.items()
        } for scale in scales
    }

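
# For illustration, with the hypothetical data file above and scales [20, 21],
# the returned map looks like (commitment values elided):
#
#     {
#         "20": {"x": "0x...", "y": "0x..."},
#         "21": {"x": "0x...", "y": "0x..."},
#     }
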

# ===================================================================================================
# Private functions
# ===================================================================================================

def _export_onnx(model: Type[IModel], data_tensor_array: list[torch.Tensor], model_loc: str) -> None:
    circuit = model()
    try:
        circuit.preprocess(data_tensor_array)
    except AttributeError:
        # the model has no preprocess step
        pass

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    circuit.to(device)

    # flip the model into inference mode
    circuit.eval()

    input_names = []
    dynamic_axes = {}
    data_tensor_tuple = ()
    for i in range(len(data_tensor_array)):
        data_tensor_tuple += (data_tensor_array[i],)
        input_name = "input" + str(i + 1)
        input_names.append(input_name)
        dynamic_axes[input_name] = {0: 'batch_size'}
    dynamic_axes["output"] = {0: 'batch_size'}

    # export the model
    torch.onnx.export(
        circuit,                    # model being run
        data_tensor_tuple,          # model input (or a tuple for multiple inputs)
        model_loc,                  # where to save the model (can be a file or file-like object)
        export_params=True,         # store the trained parameter weights inside the model file
        opset_version=11,           # the ONNX version to export the model to
        do_constant_folding=True,   # whether to execute constant folding for optimization
        input_names=input_names,    # the model's input names
        output_names=['output'],    # the model's output names
        dynamic_axes=dynamic_axes,
    )

# mode is either "accuracy" or "resources"
# sel_data = selected columns from the data that will be used for computation
def _gen_settings(
    sel_data_path: str,
    onnx_filename: str,
    scale: Union[list[int], Literal["default"]],
    mode: Literal["resources", "accuracy"],
    settings_filename: str,
) -> None:
    print("==== Generate & Calibrate Setting ====")
    # Set the input to be a Poseidon hash and the params of the computation graph to be private.
    # Poseidon is not additively homomorphic; maybe consider Pedersen or Dory commitments instead.
    gip_run_args = ezkl.PyRunArgs()
    gip_run_args.input_visibility = "hashed"  # one commitment (hash of the values) per column
    gip_run_args.param_visibility = "private"  # no parameters shown
    gip_run_args.output_visibility = "public"  # should be `(torch.Tensor(1.0), output)`

    # generate the settings
    ezkl.gen_settings(onnx_filename, settings_filename, py_run_args=gip_run_args)
    if scale == "default":
        ezkl.calibrate_settings(
            sel_data_path, onnx_filename, settings_filename, mode)
    else:
        assert isinstance(scale, list)
        ezkl.calibrate_settings(
            sel_data_path, onnx_filename, settings_filename, mode, scales=scale)

    assert os.path.exists(settings_filename)
    assert os.path.exists(sel_data_path)
    assert os.path.exists(onnx_filename)
    with open(settings_filename, "r") as f_setting:
        print("scale: ", scale)
        print("setting: ", f_setting.read())

def _process_data(
    data_path: str,
    col_array: list[str],
    sel_data_path: str,
) -> list[torch.Tensor]:
    data_tensor_array = []
    sel_data = []
    with open(data_path, "r") as f:
        data_onefile = json.load(f)

    for col in col_array:
        data = data_onefile[col]
        data_tensor = torch.tensor(data, dtype=torch.float32)
        data_tensor_array.append(torch.reshape(data_tensor, (1, -1, 1)))
        sel_data.append(data)
    # serialize the selected data (taken from `data`) into a file for ezkl
    with open(sel_data_path, 'w') as f:
        json.dump(dict(input_data=sel_data), f)
    return data_tensor_array

def _get_commitment_for_column(column: list[float], scale: int) -> str:
    # Ref: https://github.com/zkonduit/ezkl/discussions/633
    serialized_data = [ezkl.float_to_vecu64(x, scale) for x in column]
    res_poseidon_hash = ezkl.poseidon_hash(serialized_data)
    res_hex = ezkl.vecu64_to_felt(res_poseidon_hash[0])
    return res_hex