feat: convert data file to json in core lib

commit 7704bf6e6e
parent a35c0af4b5
Author: mhchia
Date:   2024-04-25 20:30:25 +08:00

3 changed files with 119 additions and 20 deletions
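In short: data files no longer have to be JSON. Callers can pass any format listed in the new `DataExtension` enum (currently `.csv` and `.json`), and the core library converts the file to JSON next to the source before committing or proving. A minimal sketch of the flow this enables, with illustrative file contents and paths:

# Sketch only: a CSV data file can now be passed straight to
# generate_data_commitment, which converts it to data.json internally
# via _preprocess_data_file_to_json before hashing each column.
from zkstats.core import generate_data_commitment

with open("data.csv", "w") as f:
    f.write("columns_0,columns_1\n1.0,6.0\n2.0,7.0\n")

generate_data_commitment("data.csv", [2, 3], "commitments.json")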


@@ -16,7 +16,7 @@ ERROR_CIRCUIT_STRICT = 0.0001
 ERROR_CIRCUIT_RELAXED = 0.1


-def data_to_file(data_path: Path, data: list[torch.Tensor]) -> dict[str, list]:
+def data_to_json_file(data_path: Path, data: list[torch.Tensor]) -> dict[str, list]:
     column_names = [f"columns_{i}" for i in range(len(data))]
     column_to_data = {
         column: d.tolist()
@@ -45,7 +45,7 @@ def compute(
data_path = basepath / "data.json"
data_commitment_path = basepath / "commitments.json"
column_to_data = data_to_file(data_path, data)
column_to_data = data_to_json_file(data_path, data)
# If selected_columns_params is None, select all columns
if selected_columns_params is None:
selected_columns = list(column_to_data.keys())
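For reference, `data_to_json_file` writes a single JSON object keyed by generated column names and returns the same mapping, so for two tensors the helper produces something like this (values illustrative):

# Illustrative return value of data_to_json_file(data_path, [column_0, column_1]);
# the identical mapping is written to data_path as JSON.
{"columns_0": [3.0, 4.5, 1.0], "columns_1": [2.7, 3.3, 1.1]}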


@@ -2,10 +2,10 @@ import json
 import torch

-from zkstats.core import generate_data_commitment
+from zkstats.core import generate_data_commitment, _preprocess_data_file_to_json
 from zkstats.computation import computation_to_model

-from .helpers import data_to_file, compute
+from .helpers import data_to_json_file, compute


 def test_get_data_commitment_maps(tmp_path, column_0, column_1, scales):
@@ -16,7 +16,7 @@ def test_get_data_commitment_maps(tmp_path, column_0, column_1, scales):
# "columns_0": [1, 2, 3, 4, 5],
# "columns_1": [6, 7, 8, 9, 10],
# }
data_json = data_to_file(data_path, [column_0, column_1])
data_json = data_to_json_file(data_path, [column_0, column_1])
# data_commitment is a mapping[scale -> mapping[column_name, commitment_hex]]
# {
# scale_0: {
@@ -51,7 +51,7 @@ def test_get_data_commitment_maps_hardcoded(tmp_path):
     data_commitment_path = tmp_path / "commitments.json"
     column_0 = torch.tensor([3.0, 4.5, 1.0, 2.0, 7.5, 6.4, 5.5])
     column_1 = torch.tensor([2.7, 3.3, 1.1, 2.2, 3.8, 8.2, 4.4])
-    data_to_file(data_path, [column_0, column_1])
+    data_to_json_file(data_path, [column_0, column_1])
     scales = [2, 3]
     generate_data_commitment(data_path, scales, data_commitment_path)
     with open(data_commitment_path, "r") as f:
@@ -63,7 +63,7 @@ def test_get_data_commitment_maps_hardcoded(tmp_path):
 def test_integration_select_partial_columns(tmp_path, column_0, column_1, error, scales):
     data_path = tmp_path / "data.json"
-    data_json = data_to_file(data_path, [column_0, column_1])
+    data_json = data_to_json_file(data_path, [column_0, column_1])
     columns = list(data_json.keys())
     assert len(columns) == 2
     # Select only the first column from two columns
@@ -75,3 +75,45 @@ def test_integration_select_partial_columns(tmp_path, column_0, column_1, error,
     _, model = computation_to_model(simple_computation, error)
     # gen settings, setup, prove, verify
     compute(tmp_path, [column_0, column_1], model, scales, selected_columns)
+
+
+def json_file_to_csv(data_json_path, data_csv_path):
+    with open(data_json_path, "r") as f:
+        data_from_json = json.load(f)
+    # Generate csv file from json
+    column_names = list(data_from_json.keys())
+    len_columns = len(data_from_json[column_names[0]])
+    for column in column_names:
+        assert len(data_from_json[column]) == len_columns, "All columns should have the same length"
+    rows = [
+        [str(data_from_json[column][i]) for column in column_names]
+        for i in range(len_columns)
+    ]
+    with open(data_csv_path, "w") as f:
+        f.write(",".join(column_names) + "\n")
+        for row in rows:
+            f.write(",".join(row) + "\n")
+
+
+def test__preprocess_data_file_to_json(tmp_path, column_0, column_1):
+    data_json_path = tmp_path / "data.json"
+    data_from_json = data_to_json_file(data_json_path, [column_0, column_1])
+    # Test: csv can be converted to json
+    # 1. Generate a csv file from json
+    data_csv_path = tmp_path / "data.csv"
+    json_file_to_csv(data_json_path, data_csv_path)
+    # 2. Convert csv to json
+    data_from_csv_json_path = tmp_path / "data_from_csv.json"
+    _preprocess_data_file_to_json(data_csv_path, data_from_csv_json_path)
+    with open(data_from_csv_json_path, "r") as f:
+        data_from_csv = json.load(f)
+    # 3. Compare the two json files
+    assert data_from_csv == data_from_json
+    # Test: this function can also handle json format by just copying the file
+    new_data_json_path = tmp_path / "new_data.json"
+    _preprocess_data_file_to_json(data_json_path, new_data_json_path)
+    with open(new_data_json_path, "r") as f:
+        new_data_from_json = json.load(f)
+    assert new_data_from_json == data_from_json
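To make the round trip concrete, the CSV layout that `json_file_to_csv` writes, and that `_csv_file_to_json` parses back, is a header row of column names followed by one row per index (values illustrative):

columns_0,columns_1
3.0,2.7
4.5,3.3
1.0,1.1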


@@ -1,11 +1,15 @@
-from typing import Type, Sequence, Mapping, Union, Literal
-import torch
-import ezkl
+import csv
+from pathlib import Path
+from typing import Type, Sequence, Mapping, Union, Literal, Callable
+from enum import Enum
 import os
 import numpy as np
 import json
 import time
+
+import torch
+import ezkl

 from zkstats.computation import IModel
@@ -40,7 +44,11 @@ def create_dummy(data_path: str, dummy_data_path: str) -> None:
"""
Create a dummy data file with randomized data based on the shape of the original data.
"""
data = json.loads(open(data_path, "r").read())
# Convert data file to json under the same directory but with suffix .json
data_path: Path = Path(data_path)
data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value)
data = json.loads(open(data_json_path, "r").read())
# assume all columns have same number of rows
dummy_data ={}
for col in data:
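With the conversion step in place, `create_dummy` accepts any `DataExtension` source. A hypothetical call (paths illustrative):

# Hypothetical usage: a CSV source is converted to data.json beside it
# before dummy data is generated from the original data's shape.
from zkstats.core import create_dummy

create_dummy("data.csv", "dummy_data.json")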
@@ -270,16 +278,17 @@ def generate_data_commitment(data_path: str, scales: Sequence[int], data_commitm
     Generate and store data commitment maps for different scales so that verifiers can verify
     proofs with different scales.

-    :param data_path: path to the data file. The data file should be a JSON file with the following format:
-    {
-        "column_0": [number_0, number_1, ...],
-        "column_1": [number_0, number_1, ...],
-    }
+    :param data_path: data file path. The format must be one of the formats defined in `DataExtension`
     :param scales: a list of scales to use for the commitments
     :param data_commitment_path: path to store the generated data commitment maps
     """
-    with open(data_path) as f:
+    # Convert the file at `data_path` to the json file `data_json_path`
+    data_path: Path = Path(data_path)
+    data_json_path = data_path.with_suffix(DataExtension.JSON.value)
+    _preprocess_data_file_to_json(data_path, data_json_path)
+    with open(data_json_path) as f:
         data_json = json.load(f)
     data_commitments = {
         str(scale): {
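The commitment file layout itself is unchanged: a map from scale to per-column commitment hex strings, as the test comments earlier describe. Illustrative shape (hashes abbreviated):

# Illustrative contents of data_commitment_path for scales [2, 3]:
{
    "2": {"columns_0": "0x1a...", "columns_1": "0x2b..."},
    "3": {"columns_0": "0x3c...", "columns_1": "0x4d..."},
}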
@@ -367,14 +376,62 @@ def _gen_settings(
print("setting: ", f_setting.read())
def _csv_file_to_json(old_file_path: Union[Path, str], out_data_json_path: Union[Path, str], *, delimiter: str = ",") -> None:
data_csv_path = Path(old_file_path)
with open(data_csv_path, 'r') as f_csv:
reader = csv.reader(f_csv, delimiter=delimiter, strict=True)
# Read all data from the reader to `rows`
rows_with_column_name = tuple(reader)
if len(rows_with_column_name) < 1:
raise ValueError("No column names in the CSV file")
if len(rows_with_column_name) < 2:
raise ValueError("No data in the CSV file")
column_names = rows_with_column_name[0]
rows = rows_with_column_name[1:]
columns = [
[
float(rows[j][i])
for j in range(len(rows))
]
for i in range(len(rows[0]))
]
data = {
column_name: column_data
for column_name, column_data in zip(column_names, columns)
}
with open(out_data_json_path, "w") as f_json:
json.dump(data, f_json)
class DataExtension(Enum):
CSV = ".csv"
JSON = ".json"
DATA_FORMAT_PREPROCESSING_FUNCTION: dict[DataExtension, Callable[[Union[Path, str], Path], None]] = {
DataExtension.CSV: _csv_file_to_json,
DataExtension.JSON: lambda old_file_path, out_data_json_path: Path(out_data_json_path).write_text(Path(old_file_path).read_text())
}
def _preprocess_data_file_to_json(data_path: Union[Path, str], out_data_json_path: Path):
data_file_extension = DataExtension(data_path.suffix)
preprocess_function = DATA_FORMAT_PREPROCESSING_FUNCTION[data_file_extension]
preprocess_function(data_path, out_data_json_path)
def _process_data(
data_path: str,
data_path: Union[str | Path],
col_array: list[str],
sel_data_path: list[str],
) -> list[torch.Tensor]:
data_tensor_array=[]
sel_data = []
data_onefile = json.loads(open(data_path, "r").read())
data_path: Path = Path(data_path)
# Convert data file to json under the same directory but with suffix .json
data_json_path = Path(data_path).with_suffix(DataExtension.JSON.value)
_preprocess_data_file_to_json(data_path, data_json_path)
data_onefile = json.loads(open(data_json_path, "r").read())
for col in col_array:
data = data_onefile[col]
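The `DATA_FORMAT_PREPROCESSING_FUNCTION` dispatch table keeps format support open-ended: a new format costs one enum member plus one converter entry. A hypothetical sketch for tab-separated files, reusing `_csv_file_to_json`'s `delimiter` keyword (TSV support is not part of this commit):

# Hypothetical extension, not in this commit; assumes _csv_file_to_json
# as defined above is in scope.
from enum import Enum
from functools import partial
from pathlib import Path

class DataExtension(Enum):
    CSV = ".csv"
    JSON = ".json"
    TSV = ".tsv"  # new, hypothetical member

DATA_FORMAT_PREPROCESSING_FUNCTION = {
    DataExtension.CSV: _csv_file_to_json,
    DataExtension.JSON: lambda src, dst: Path(dst).write_text(Path(src).read_text()),
    DataExtension.TSV: partial(_csv_file_to_json, delimiter="\t"),
}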
@@ -394,4 +451,4 @@ def _get_commitment_for_column(column: list[float], scale: int) -> str:
     res_poseidon_hash = ezkl.poseidon_hash(serialized_data)[0]
     # res_hex = ezkl.vecu64_to_felt(res_poseidon_hash[0])
-    return res_poseidon_hash
\ No newline at end of file
+    return res_poseidon_hash