# zk-stats-lib/zkstats/core.py
from typing import Type, Sequence, Mapping, Union, Literal
import torch
import ezkl
import os
import numpy as np
import json
import time
from zkstats.computation import IModel
# ===================================================================================================
# ===================================================================================================
def verifier_define_calculation(
dummy_data_path: str,
selected_columns: list[str],
    # TODO: dummy_sel_data_path is redundant here, but it is kept so that _process_data can be reused
dummy_sel_data_path: str,
verifier_model: Type[IModel],
verifier_model_path: str,
) -> None:
"""
Export the verifier model to an ONNX file.
:param dummy_data_path: path to the dummy data file
:param selected_columns: column names selected for computation
:param dummy_sel_data_path: path to store generated preprocessed dummy data file
:param verifier_model: the verifier model class
:param verifier_model_path: path to store the generated verifier model file in onnx format
"""
dummy_data_tensor_array = _process_data(dummy_data_path, selected_columns, dummy_sel_data_path)
# export onnx file
_export_onnx(verifier_model, dummy_data_tensor_array, verifier_model_path)
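# Usage sketch (hypothetical file names and a hypothetical `MyModel` IModel subclass, shown
# only to illustrate the call shape):
#   verifier_define_calculation(
#       dummy_data_path="dummy_data.json",
#       selected_columns=["col_a", "col_b"],
#       dummy_sel_data_path="dummy_sel_data.json",
#       verifier_model=MyModel,
#       verifier_model_path="verifier_model.onnx",
#   )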
# TODO: Should only need the shape of data instead of the real dataset, since
# users (verifiers) call this function and they don't have the real data.
def create_dummy(data_path: str, dummy_data_path: str) -> None:
"""
Create a dummy data file with randomized data based on the shape of the original data.
"""
    with open(data_path, "r") as f:
        data = json.load(f)
    # assume all columns have the same number of rows
    dummy_data = {}
    for col in data:
        # use different random values per column to avoid degenerate cases (e.g. a singular matrix)
        dummy_data[col] = np.round(np.random.uniform(1, 30, len(data[col])), 1).tolist()
    with open(dummy_data_path, 'w') as f:
        json.dump(dummy_data, f)
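# Usage sketch (hypothetical file names): produce a randomized stand-in with the same columns
# and row counts as the original data file.
#   create_dummy("data.json", "dummy_data.json")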
# ===================================================================================================
# ===================================================================================================
def prover_gen_settings(
data_path: str,
selected_columns: list[str],
    sel_data_path: str,
prover_model: Type[IModel],
prover_model_path: str,
scale: Union[list[int], Literal["default"]],
# TODO: should be able to hardcode mode to "resources" or make it default?
mode: Union[Literal["resources"], Literal["accuracy"]],
settings_path: str,
) -> None:
"""
Generate and calibrate settings for the given model and data.
:param data_path: path to the data file
:param selected_columns: column names selected for computation
:param sel_data_path: path to store generated preprocessed data file
:param prover_model: the prover model class
:param prover_model_path: path to store the generated prover model file in onnx format
    :param scale: the scale to use for the computation: a list of integers, or "default" for the default scale
:param mode: the mode to use for the computation. It's either "resources" or "accuracy"
:param settings_path: path to store the generated settings file
"""
data_tensor_array = _process_data(data_path, selected_columns, sel_data_path)
# export onnx file
_export_onnx(prover_model, data_tensor_array, prover_model_path)
# gen + calibrate setting
_gen_settings(sel_data_path, prover_model_path, scale, mode, settings_path)
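# Usage sketch (hypothetical file names and a hypothetical `MyModel` IModel subclass):
#   prover_gen_settings(
#       data_path="data.json",
#       selected_columns=["col_a", "col_b"],
#       sel_data_path="sel_data.json",
#       prover_model=MyModel,
#       prover_model_path="prover_model.onnx",
#       scale="default",
#       mode="resources",
#       settings_path="settings.json",
#   )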
# ===================================================================================================
# ===================================================================================================
def setup(
model_path: str,
compiled_model_path: str,
settings_path: str,
vk_path: str,
pk_path: str,
) -> None:
"""
    Compile the verifier model and generate the verification key and proving key.
:param model_path: path to the model file in onnx format
:param compiled_model_path: path to store the generated compiled verifier model
:param settings_path: path to the settings file
:param vk_path: path to store the generated verification key file
    :param pk_path: path to store the generated proving key file
"""
# compile circuit
res = ezkl.compile_circuit(model_path, compiled_model_path, settings_path)
assert res == True
# srs path
res = ezkl.get_srs(settings_path)
    # Set up the verification key (vk) and proving key (pk); the prover can reuse this pk or generate their own
print("==== setting up ezkl ====")
start_time = time.time()
res = ezkl.setup(
compiled_model_path,
vk_path,
pk_path)
end_time = time.time()
    time_setup = end_time - start_time
print(f"Time setup: {time_setup} seconds")
assert res == True
assert os.path.isfile(vk_path)
assert os.path.isfile(pk_path)
assert os.path.isfile(settings_path)
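# Usage sketch (hypothetical file names; assumes the ONNX model and settings were generated
# beforehand, e.g. by verifier_define_calculation / prover_gen_settings):
#   setup(
#       model_path="verifier_model.onnx",
#       compiled_model_path="verifier_model.compiled",
#       settings_path="settings.json",
#       vk_path="model.vk",
#       pk_path="model.pk",
#   )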
# ===================================================================================================
# ===================================================================================================
def prover_gen_proof(
prover_model_path: str,
sel_data_path: str,
witness_path: str,
prover_compiled_model_path: str,
settings_path: str,
proof_path: str,
pk_path: str,
) -> None:
"""
Generate a proof for the given model and data.
:param prover_model_path: path to the prover model file in onnx format
:param sel_data_path: path to the preprocessed data file
:param witness_path: path to store the generated witness file
:param prover_compiled_model_path: path to store the generated compiled prover model
:param settings_path: path to the settings file
:param proof_path: path to store the generated proof file
    :param pk_path: path to the proving key file
"""
res = ezkl.compile_circuit(prover_model_path, prover_compiled_model_path, settings_path)
assert res == True
# now generate the witness file
print('==== Generating Witness ====')
witness = ezkl.gen_witness(sel_data_path, prover_compiled_model_path, witness_path)
assert os.path.isfile(witness_path)
# print(witness["outputs"])
    with open(settings_path) as f:
        settings = json.load(f)
output_scale = settings['model_output_scales']
print("witness boolean: ", ezkl.vecu64_to_float(witness['outputs'][0][0], output_scale[0]))
for i in range(len(witness['outputs'][1])):
print("witness result", i+1,":", ezkl.vecu64_to_float(witness['outputs'][1][i], output_scale[1]))
# GENERATE A PROOF
print("==== Generating Proof ====")
start_time = time.time()
res = ezkl.prove(
witness_path,
prover_compiled_model_path,
pk_path,
proof_path,
"single",
)
print("proof: " ,res)
end_time = time.time()
time_gen_prf = end_time -start_time
print(f"Time gen prf: {time_gen_prf} seconds")
assert os.path.isfile(proof_path)
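# Usage sketch (hypothetical file names; assumes settings and the proving key come from
# prover_gen_settings and setup):
#   prover_gen_proof(
#       prover_model_path="prover_model.onnx",
#       sel_data_path="sel_data.json",
#       witness_path="witness.json",
#       prover_compiled_model_path="prover_model.compiled",
#       settings_path="settings.json",
#       proof_path="proof.json",
#       pk_path="model.pk",
#   )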
# ===================================================================================================
# ===================================================================================================
# commitment_map is a mapping[column_name, commitment_hex]
# E.g. {
# "columns_0": "0x...",
# ...
# }
TCommitmentMap = Mapping[str, str]
# commitment_maps is a mapping[scale, mapping[column_name, commitment_hex]]
# E.g. {
# scale_0: {
# "columns_0": "0x...",
# ...
# },
# ...
# }
TCommitmentMaps = Mapping[str, TCommitmentMap]
def verifier_verify(proof_path: str, settings_path: str, vk_path: str, selected_columns: Sequence[str], commitment_maps: TCommitmentMaps) -> torch.Tensor:
"""
Verify the proof and return the result.
:param proof_path: path to the proof file
:param settings_path: path to the settings file
:param vk_path: path to the verification key file
:param expected_data_commitments: expected data commitments for columns. The i-th commitment should
be stored in `expected_data_commitments[i]`.
"""
# 1. First check the zk proof is valid
res = ezkl.verify(
proof_path,
settings_path,
vk_path,
)
# TODO: change asserts to return boolean
assert res == True
# 2. Check if input/output are correct
with open(settings_path) as f:
settings = json.load(f)
input_scales = settings['model_input_scales']
output_scales = settings['model_output_scales']
with open(proof_path) as f:
proof = json.load(f)
proof_instance = proof["instances"][0]
inputs = proof_instance[:len(input_scales)]
outputs = proof_instance[len(input_scales):]
len_inputs = len(inputs)
len_outputs = len(outputs)
# `instances` = input commitments + params (which is 0 in our case) + output
assert len(proof_instance) == len_inputs + len_outputs, f"lengths mismatch: {len(proof_instance)=}, {len_inputs=}, {len_outputs=}"
    # 2.1 Check input commitments
    # All inputs are hashed, so they serve as commitments
    assert len_inputs == len(selected_columns), f"lengths mismatch: {len_inputs=}, {len(selected_columns)=}"
    # Check that each commitment matches the expected commitment for its column
for i, (actual_commitment, column_name) in enumerate(zip(inputs, selected_columns)):
actual_commitment_str = ezkl.vecu64_to_felt(actual_commitment)
input_scale = input_scales[i]
expected_commitment = commitment_maps[str(input_scale)][column_name]
assert actual_commitment_str == expected_commitment, f"commitment mismatch: {i=}, {actual_commitment_str=}, {expected_commitment=}"
    # 2.2 Check the output is correct
    # The output is a tuple (validity_flag, result): the flag must be 1.0 (True),
    # meaning the result is within the allowed error bound
    is_in_error = ezkl.vecu64_to_float(outputs[0], output_scales[0])
    assert is_in_error == 1.0, "result is not within the allowed error bound"
    return ezkl.vecu64_to_float(outputs[1], output_scales[1])
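# Usage sketch (hypothetical file names; `commitment_maps` is assumed to come from
# get_data_commitment_maps):
#   result = verifier_verify(
#       proof_path="proof.json",
#       settings_path="settings.json",
#       vk_path="model.vk",
#       selected_columns=["col_a", "col_b"],
#       commitment_maps=commitment_maps,
#   )
#   print("verified result:", result)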
# ===================================================================================================
# ===================================================================================================
def get_data_commitment_maps(data_path: str, scales: Sequence[int]) -> TCommitmentMaps:
"""
Generate a data commitment map for each scale. Commitments for different scales are required
so that verifiers can verify proofs with different scales.
:param data_path: path to the data file. The data file should be a JSON file with the following format:
{
"column_0": [number_0, number_1, ...],
"column_1": [number_0, number_1, ...],
}
:param scales: a list of scales to use for the commitments.
:return: a map from scale to column name to commitment.
"""
with open(data_path) as f:
data_json = json.load(f)
return {
str(scale): {
k: _get_commitment_for_column(v, scale) for k, v in data_json.items()
} for scale in scales
}
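# Usage sketch (hypothetical file name and scales):
#   commitment_maps = get_data_commitment_maps("data.json", scales=[0, 1, 2])
#   # e.g. commitment_maps["0"]["col_a"] -> "0x..."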
# ===================================================================================================
# Private functions
# ===================================================================================================
def _export_onnx(model: Type[IModel], data_tensor_array: list[torch.Tensor], model_loc: str) -> None:
circuit = model()
try:
circuit.preprocess(data_tensor_array)
except AttributeError:
pass
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print(device)
circuit.to(device)
# Flips the neural net into inference mode
circuit.eval()
input_names = []
dynamic_axes = {}
data_tensor_tuple = ()
for i in range(len(data_tensor_array)):
data_tensor_tuple += (data_tensor_array[i],)
input_index = "input"+str(i+1)
input_names.append(input_index)
dynamic_axes[input_index] = {0 : 'batch_size'}
dynamic_axes["output"] = {0 : 'batch_size'}
# Export the model
torch.onnx.export(circuit, # model being run
data_tensor_tuple, # model input (or a tuple for multiple inputs)
model_loc, # where to save the model (can be a file or file-like object)
export_params=True, # store the trained parameter weights inside the model file
opset_version=11, # the ONNX version to export the model to
do_constant_folding=True, # whether to execute constant folding for optimization
input_names = input_names, # the model's input names
output_names = ['output'], # the model's output names
dynamic_axes=dynamic_axes)
# mode is either "accuracy" or "resources"
# sel_data = selected column from data that will be used for computation
def _gen_settings(
sel_data_path: str,
onnx_filename: str,
scale: Union[list[int], Literal["default"]],
mode: Union[Literal["resources"], Literal["accuracy"]],
settings_filename: str,
) -> None:
print("==== Generate & Calibrate Setting ====")
    # Set the input visibility to Poseidon hash and keep the computation-graph parameters private
    # Poseidon is not additively homomorphic; Pedersen or Dory commitments may be worth considering instead
gip_run_args = ezkl.PyRunArgs()
gip_run_args.input_visibility = "hashed" # one commitment (values hashed) for each column
gip_run_args.param_visibility = "private" # no parameters shown
gip_run_args.output_visibility = "public" # should be `(torch.Tensor(1.0), output)`
# generate settings
ezkl.gen_settings(onnx_filename, settings_filename, py_run_args=gip_run_args)
if scale =="default":
ezkl.calibrate_settings(
sel_data_path, onnx_filename, settings_filename, mode)
else:
assert isinstance(scale, list)
ezkl.calibrate_settings(
sel_data_path, onnx_filename, settings_filename, mode, scales = scale)
assert os.path.exists(settings_filename)
assert os.path.exists(sel_data_path)
assert os.path.exists(onnx_filename)
    print("scale: ", scale)
    with open(settings_filename, "r") as f_setting:
        print("setting: ", f_setting.read())
def _process_data(
data_path: str,
col_array: list[str],
    sel_data_path: str,
) -> list[torch.Tensor]:
    data_tensor_array = []
    sel_data = []
    with open(data_path, "r") as f:
        data_onefile = json.load(f)
    for col in col_array:
        data = data_onefile[col]
        data_tensor = torch.tensor(data, dtype=torch.float32)
        data_tensor_array.append(torch.reshape(data_tensor, (1, -1, 1)))
        sel_data.append(data)
    # Serialize the selected columns (taken from `data`) into the preprocessed data file
    with open(sel_data_path, 'w') as f:
        json.dump(dict(input_data=sel_data), f)
    return data_tensor_array
def _get_commitment_for_column(column: list[float], scale: int) -> str:
# Ref: https://github.com/zkonduit/ezkl/discussions/633
serialized_data = [ezkl.float_to_vecu64(x, scale) for x in column]
res_poseidon_hash = ezkl.poseidon_hash(serialized_data)
res_hex = ezkl.vecu64_to_felt(res_poseidon_hash[0])
return res_hex
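# End-to-end sketch of how the public functions above fit together (hypothetical file names,
# columns, scales, and `MyModel`; not a prescribed workflow):
#   1. Data provider publishes commitments:
#        commitment_maps = get_data_commitment_maps("data.json", scales=[2])
#   2. Verifier defines the computation from dummy data of the same shape:
#        verifier_define_calculation("dummy_data.json", ["col_a"], "dummy_sel_data.json", MyModel, "verifier_model.onnx")
#   3. Prover generates settings, keys, and the proof:
#        prover_gen_settings("data.json", ["col_a"], "sel_data.json", MyModel, "prover_model.onnx", [2], "resources", "settings.json")
#        setup("verifier_model.onnx", "verifier_model.compiled", "settings.json", "model.vk", "model.pk")
#        prover_gen_proof("prover_model.onnx", "sel_data.json", "witness.json", "prover_model.compiled", "settings.json", "proof.json", "model.pk")
#   4. Verifier checks the proof against the published commitments:
#        result = verifier_verify("proof.json", "settings.json", "model.vk", ["col_a"], commitment_maps)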