clarify mode output first num, import zkstats, comment mode/median

JernKunpittaya
2024-03-03 13:46:34 +07:00
parent f098f09749
commit 4537a77c43
16 changed files with 580 additions and 1194 deletions

@@ -208,5 +208,11 @@ See our jupyter notebook for [benchmarks](./benchmark/).
 ## Note
 - We implement using the witness approach instead of directly calculating the value in the circuit. This sometimes lets us avoid computing things like division or exponentiation, which would require a larger scale in the settings (without the larger scale, their accuracy would be very poor); a sketch of this idea follows this diff.
-- For non-linearity function, larger scale leads to larger lookup table, hence bigger circuit size. Can compare between geomean_OG (implemented in traditional way, instead of witness approach) which is the non-linearity function (p bad with larger scale), and mean_OG which is linear function (p fine with larger scale). Hence, we can say that for linearity func like mean, we can use traditional way, while for non-linear func like geomean, we should use witness approach.
 - Dummy data fed to the verifier onnx file must have the same shape as the private dataset, but can be filled with any values (we just randomize them uniformly over 1-10 with one decimal place; see the example after this diff).
+- For the Mode function, if more than one value is possible, we output just one of them (the first one encountered), conforming to the spec of statistics.mode in the Python standard library (https://docs.python.org/3.9/library/statistics.html#statistics.mode); this is illustrated after this diff.
+
+## Legacy
+Not relevant after Commit 48142b5b9ab6b577deadc27886018131008ebad5
+- For non-linear functions, a larger scale leads to a larger lookup table and hence a bigger circuit. Compare geomean_OG (implemented the traditional way rather than via the witness approach), a non-linear function that does pretty badly with a larger scale, against mean_OG, a linear function that is fine with a larger scale. So for a linear function like mean we could use the traditional way, while for a non-linear function like geomean we should use the witness approach. However, we abstract this away by using only the witness approach throughout the library (which makes sense!).
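
To ground the witness-approach bullet above, here is a minimal sketch of how a division can be verified without ever dividing in-circuit. This is illustrative only, not the library's circuit code; the function name and tolerance are assumptions. The prover computes the quotient off-circuit, and the circuit only checks the equivalent multiplicative constraint:

```python
import torch

def check_division_witness(a: torch.Tensor, b: torch.Tensor, y: torch.Tensor, error: float) -> torch.Tensor:
    # The prover claims y == a / b; the circuit never divides. It only checks
    # |y * b - a| <= error * |a|, which uses multiplication and comparison
    # and therefore needs no larger scale for accuracy.
    return torch.abs(y * b - a) <= error * torch.abs(a)

a, b = torch.tensor(7.0), torch.tensor(2.0)
y = a / b  # the witness, computed outside the circuit by the prover
print(check_division_witness(a, b, y, 0.01))  # tensor(True)
```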
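
For the dummy-data bullet, one way to generate a same-shaped random array (a sketch of the idea, not the actual create_dummy internals; the shape here is hypothetical):

```python
import torch

shape = (300, 1)  # must match the shape of the private dataset
# uniform values in [1, 10], rounded to one decimal place
dummy = torch.round((torch.rand(shape) * 9 + 1) * 10) / 10
```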
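
And the Mode tie-breaking rule can be seen directly in the standard-library function this commit conforms to:

```python
import statistics

# 1.0 and 2.0 are equally common; statistics.mode returns the first one encountered
print(statistics.mode([1.0, 2.0, 1.0, 2.0, 3.0]))  # 1.0
```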


@@ -76,7 +76,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%run -i ../../zkstats/core.py"
+"from zkstats.core import create_dummy, verifier_define_calculation, prover_gen_settings, setup, prover_gen_proof, verifier_verify, get_data_commitment_maps"
 ]
 },
 {
@@ -299,7 +299,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.12.1"
+"version": "3.11.4"
 },
 "orig_nbformat": 4
 },


@@ -76,7 +76,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%run -i ../../zkstats/core.py"
+"from zkstats.core import create_dummy, verifier_define_calculation, prover_gen_settings, setup, prover_gen_proof, verifier_verify, get_data_commitment_maps"
 ]
 },
 {
@@ -293,7 +293,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.12.1"
+"version": "3.11.4"
 },
 "orig_nbformat": 4
 },


@@ -57,7 +57,6 @@ class Median(Operation):
         self.lower = torch.nn.Parameter(data = torch.tensor(sorted_x[int(len_x/2)-1], dtype = torch.float32), requires_grad=False)
         self.upper = torch.nn.Parameter(data = torch.tensor(sorted_x[int(len_x/2)], dtype = torch.float32), requires_grad=False)
-
     @classmethod
     def create(cls, x: list[torch.Tensor], error: float) -> 'Median':
         return cls(x[0], error)
@@ -85,7 +84,6 @@ class Median(Operation):
         median_in_cons = torch.logical_and(less_cons, more_cons)
         median_out_cons = torch.logical_and(torch.logical_and(bound, bound_avg), torch.logical_and(torch.logical_and(lower_cons, upper_cons), torch.logical_and(lower_exist, upper_exist)))
         return torch.where(count_equal==0, median_out_cons, median_in_cons)
-
 class GeometricMean(Operation):
@@ -120,13 +118,14 @@ def mode_within(data_array: torch.Tensor, error: float) -> torch.Tensor:
     """
     Find the mode (the single most common data point) from the data_array.
     :param data_array: The data array.
-    :param error: The error that allows the data point to be considered as the same.
-        For example, if error = 0.01, then 0.999 and 1.001 are considered as the same.
+    :param error: The error that allows the data point to be considered the same.
+        For example, if error = 0.01, then 0.999 and 1.000 are considered the same.
     """
     max_sum_freq = 0
     mode = data_array[0]
-    for check_val in set(data_array):
+    # print("arrrrr: ", data_array)
+    # print("seetttt: ", torch.unique(data_array))
+    for check_val in data_array:
         sum_freq = sum(1 for ele in data_array if abs(ele - check_val) <= abs(error * check_val))
         if sum_freq > max_sum_freq:
             mode = check_val
@@ -138,10 +137,14 @@ class Mode(Operation):
     @classmethod
     def create(cls, x: list[torch.Tensor], error: float) -> 'Mode':
         x_1d = to_1d(x[0])
-        # FIXME: Now hardcode 0.01 to be acceptable range of dataset that
-        # we want to consider it the same, totally different from our result_error
-        # This value doesn't depend on any scale, but on the dataset itself.
-        result = torch.tensor(mode_within(x_1d, 0.01))
+        # FIXME: Mode has no result_error, only num_error: the deviation within which
+        # two numbers are considered the same. (This makes sense: if every data point
+        # in a dataset were distinct, the mode would be trivial without such a tolerance.)
+        # This value does not depend on any scale, only on the dataset itself and the
+        # intention of the evaluator. For example, 0.01 counts values as the same when
+        # they lie within a 1% range of each other.
+        # For the strict definition of Mode, just set this error to 0.
+        result = torch.tensor(mode_within(x_1d, error))
         return cls(result, error)
     def ezkl(self, x: list[torch.Tensor]) -> IsResultPrecise:
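
The Median hunk above stores the two elements adjacent to the middle of the sorted data as lower and upper. A standalone sketch of that selection (illustrative only, using the same int(len/2) indexing as the code above):

```python
import torch

def median_neighbors(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # sort and take the two middle elements, as in Median.__init__ above
    sorted_x, _ = torch.sort(x)
    n = len(sorted_x)
    return sorted_x[int(n / 2) - 1], sorted_x[int(n / 2)]

lower, upper = median_neighbors(torch.tensor([4.0, 1.0, 3.0, 2.0]))
print((lower + upper) / 2)  # tensor(2.5000): the median of an even-length dataset
```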
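
The tolerance semantics of mode_within can also be checked standalone. This is a self-contained sketch mirroring the function above (the max_sum_freq update, elided in the hunk, is assumed):

```python
import torch

def mode_within(data_array: torch.Tensor, error: float) -> torch.Tensor:
    max_sum_freq = 0
    mode = data_array[0]
    for check_val in data_array:
        # count every element within the relative tolerance of check_val
        sum_freq = sum(1 for ele in data_array if abs(ele - check_val) <= abs(error * check_val))
        if sum_freq > max_sum_freq:
            mode = check_val
            max_sum_freq = sum_freq
    return mode

data = torch.tensor([0.999, 1.000, 1.001, 5.0])
print(mode_within(data, 0.01))  # tensor(0.9990): three values lie within 1% of 0.999
```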
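
And the "strict definition" case from the new comment: with the error set to 0 only exact matches count, and iterating data_array in order (rather than over set(data_array)) resolves ties to the first-encountered value, exactly like statistics.mode:

```python
import statistics
import torch

data = [1.0, 1.0, 2.0, 2.0, 3.0]
assert statistics.mode(data) == 1.0                        # first of the equally common values
assert float(mode_within(torch.tensor(data), 0.0)) == 1.0  # the sketch above agrees when error = 0
```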