mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
Wrong variable names for measures in linear regression and missing one in logistic Closes #1187
222 lines
9.3 KiB
Python
222 lines
9.3 KiB
Python
# bench: Full Target: Logistic Regression
|
|
|
|
# Disable line length warnings as we have a looooong metric...
|
|
# flake8: noqa: E501
|
|
# pylint: disable=C0301
|
|
|
|
from copy import deepcopy
|
|
from typing import Any, Dict
|
|
|
|
import numpy as np
|
|
from numpy.random import RandomState
|
|
from sklearn.datasets import make_classification
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.model_selection import train_test_split
|
|
from tqdm import tqdm
|
|
|
|
from concrete.quantization import QuantizedArray, QuantizedLinear, QuantizedModule, QuantizedSigmoid
|
|
|
|
|
|
class QuantizedLogisticRegression(QuantizedModule):
|
|
"""
|
|
Quantized Logistic Regression
|
|
Building on top of QuantizedModule, this class will chain together a linear transformation
|
|
and an inverse-link function, in this case the logistic function
|
|
"""
|
|
|
|
@staticmethod
|
|
def from_sklearn(sklearn_model, calibration_data):
|
|
"""Create a Quantized Logistic Regression initialized from a sklearn trained model"""
|
|
if sklearn_model.coef_.ndim == 1:
|
|
weights = np.expand_dims(sklearn_model.coef_, 1)
|
|
else:
|
|
weights = sklearn_model.coef_.transpose()
|
|
|
|
bias = sklearn_model.intercept_
|
|
|
|
# In our case we have two data dimensions, we the weights precision needs to be 2 bits, as
|
|
# for now we need the quantized values to be greater than zero for weights
|
|
# Thus, to insure a maximum of 7 bits in the output of the linear transformation, we choose
|
|
# 4 bits for the data and the minimum of 1 for the bias
|
|
return QuantizedLogisticRegression(4, 2, 1, 6, weights, bias, calibration_data)
|
|
|
|
def __init__(self, q_bits, w_bits, b_bits, out_bits, weights, bias, calibration_data) -> None:
|
|
"""
|
|
Create the Logistic regression with different quantization bit precisions:
|
|
|
|
Quantization Parameters - Number of bits:
|
|
q_bits (int): bits for input data, insuring that the number of bits of
|
|
the w . x + b operation does not exceed 7 for the calibration data
|
|
w_bits (int): bits for weights: in the case of a univariate regression this
|
|
can be 1
|
|
b_bits (int): bits for bias (this is a single value so a single bit is enough)
|
|
out_bits (int): bits for the result of the linear transformation (w.x + b).
|
|
In the case of Logistic Regression the result of the linear
|
|
transformation is input to a univariate inverse-link function, so
|
|
this value can be 7
|
|
|
|
Other parameters:
|
|
weights: a numpy nd-array of weights (Nxd) where d is the data dimensionality
|
|
bias: a numpy scalar
|
|
calibration_data: a numpy nd-array of data (Nxd)
|
|
"""
|
|
self.n_bits = out_bits
|
|
|
|
# We need to calibrate to a sufficiently low number of bits
|
|
# so that the output of the Linear layer (w . x + b)
|
|
# does not exceed 7 bits
|
|
self.q_calibration_data = QuantizedArray(q_bits, calibration_data)
|
|
|
|
# Quantize the weights and create the quantized linear layer
|
|
q_weights = QuantizedArray(w_bits, weights)
|
|
q_bias = QuantizedArray(b_bits, bias)
|
|
q_layer = QuantizedLinear(out_bits, q_weights, q_bias)
|
|
|
|
# Store quantized layers
|
|
quant_layers_dict: Dict[str, Any] = {}
|
|
|
|
# Calibrate the linear layer and obtain calibration_data for the next layers
|
|
calibration_data = self._calibrate_and_store_layers_activation(
|
|
"linear", q_layer, calibration_data, quant_layers_dict
|
|
)
|
|
|
|
# Add the inverse-link for inference.
|
|
# This needs to be quantized since it's computed in FHE,
|
|
# but we can use 7 bits of output since, in this case,
|
|
# the result of the inverse-link is not processed by any further layers
|
|
# Seven bits is the maximum precision but this could be lowered to improve speed
|
|
# at the possible expense of higher deviance of the regressor
|
|
q_logit = QuantizedSigmoid(n_bits=7)
|
|
|
|
# Now calibrate the inverse-link function with the linear layer's output data
|
|
calibration_data = self._calibrate_and_store_layers_activation(
|
|
"invlink", q_logit, calibration_data, quant_layers_dict
|
|
)
|
|
|
|
# Finally construct our Module using the quantized layers
|
|
super().__init__(quant_layers_dict)
|
|
|
|
def _calibrate_and_store_layers_activation(
|
|
self, name, q_function, calibration_data, quant_layers_dict
|
|
):
|
|
"""
|
|
This function calibrates a layer of a quantized module (e.g. linear, inverse-link,
|
|
activation, etc) by looking at the input data, then computes the output of the quantized
|
|
version of the layer to be used as input to the following layers
|
|
"""
|
|
|
|
# Calibrate the output of the layer
|
|
q_function.calibrate(calibration_data)
|
|
# Store the learned quantized layer
|
|
quant_layers_dict[name] = q_function
|
|
# Create new calibration data (output of the previous layer)
|
|
q_calibration_data = QuantizedArray(self.n_bits, calibration_data)
|
|
# Dequantize to have the value in clear and ready for next calibration
|
|
return q_function(q_calibration_data).dequant()
|
|
|
|
def quantize_input(self, x):
|
|
q_input_arr = deepcopy(self.q_calibration_data)
|
|
q_input_arr.update_values(x)
|
|
return q_input_arr
|
|
|
|
|
|
def main():
|
|
"""Main benchmark function: generate some synthetic data for two class classification,
|
|
split train-test, train a sklearn classifier, calibrate and quantize it on the whole dataset
|
|
then compile it to FHE. Test the three versions of the classifier on the test set and
|
|
report accuracy"""
|
|
|
|
# Generate some data with a fixed seed
|
|
X, y = make_classification(
|
|
n_features=2,
|
|
n_redundant=0,
|
|
n_informative=2,
|
|
random_state=2,
|
|
n_clusters_per_class=1,
|
|
n_samples=100,
|
|
)
|
|
|
|
# Scale the data randomly, fixing seeds for reproductibility
|
|
rng = RandomState(2)
|
|
X += 2 * rng.uniform(size=X.shape)
|
|
|
|
# Split it into train/test
|
|
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
|
|
|
|
# Train a logistic regression with sklearn on the training set
|
|
logreg = LogisticRegression()
|
|
logreg.fit(x_train, y_train)
|
|
|
|
# Calibrate the model for quantization using both training and test data
|
|
calib_data = X
|
|
q_logreg = QuantizedLogisticRegression.from_sklearn(logreg, calib_data)
|
|
|
|
# Now, we can compile our model to FHE, taking as possible input set all of our dataset
|
|
X_q = q_logreg.quantize_input(X)
|
|
|
|
# bench: Measure: Compilation Time (ms)
|
|
engine = q_logreg.compile(X_q)
|
|
# bench: Measure: End
|
|
|
|
# Start classifier evaluation
|
|
|
|
# Test the original classifier
|
|
y_pred_test = np.asarray(logreg.predict(x_test))
|
|
|
|
# Now that the model is quantized, predict on the test set
|
|
x_test_q = q_logreg.quantize_input(x_test)
|
|
q_y_score_test = q_logreg.forward_and_dequant(x_test_q)
|
|
q_y_pred_test = (q_y_score_test > 0.5).astype(np.int32)
|
|
|
|
non_homomorphic_correct = 0
|
|
homomorphic_correct = 0
|
|
|
|
# Track the samples that are wrongly classified due to quantization issues
|
|
q_wrong_predictions = np.zeros((0, 2), dtype=X.dtype)
|
|
|
|
# Predict the FHE quantized classifier probabilities on the test set.
|
|
# Compute FHE quantized accuracy, clear-quantized accuracy and
|
|
# keep track of samples wrongly classified due to quantization
|
|
for i, x_i in enumerate(tqdm(x_test_q.qvalues)):
|
|
y_i = y_test[i]
|
|
|
|
fhe_in_sample = np.expand_dims(x_i, 1).transpose([1, 0]).astype(np.uint8)
|
|
|
|
# bench: Measure: Evaluation Time (ms)
|
|
q_pred_fhe = engine.run(fhe_in_sample)
|
|
# bench: Measure: End
|
|
y_score_fhe = q_logreg.dequantize_output(q_pred_fhe)
|
|
homomorphic_prediction = (y_score_fhe > 0.5).astype(np.int32)
|
|
|
|
non_homomorphic_prediction = q_y_pred_test[i]
|
|
if non_homomorphic_prediction == y_i:
|
|
non_homomorphic_correct += 1
|
|
elif y_pred_test[i] == y_i:
|
|
# If this was a correct prediction with the clear-sklearn classifier
|
|
q_wrong_predictions = np.vstack((q_wrong_predictions, x_test[i, :]))
|
|
|
|
if homomorphic_prediction == y_i:
|
|
homomorphic_correct += 1
|
|
|
|
# Aggregate accuracies for all the versions of the classifier
|
|
sklearn_acc = np.sum(y_pred_test == y_test) / len(y_test) * 100
|
|
non_homomorphic_accuracy = (non_homomorphic_correct / len(y_test)) * 100
|
|
homomorphic_accuracy = (homomorphic_correct / len(y_test)) * 100
|
|
difference = abs(homomorphic_accuracy - non_homomorphic_accuracy)
|
|
|
|
print()
|
|
print(f"Sklearn accuracy: {sklearn_acc:.4f}")
|
|
print(f"Non Homomorphic Accuracy: {non_homomorphic_accuracy:.4f}")
|
|
print(f"Homomorphic Accuracy: {homomorphic_accuracy:.4f}")
|
|
print(f"Difference Percentage: {difference:.2f}%")
|
|
|
|
# bench: Measure: Sklearn accuracy = sklearn_acc
|
|
# bench: Measure: Non Homomorphic Accuracy = non_homomorphic_accuracy
|
|
# bench: Measure: Homomorphic Accuracy = homomorphic_accuracy
|
|
# bench: Measure: Accuracy Difference Between Homomorphic and Non Homomorphic Implementation (%) = difference
|
|
# bench: Alert: Accuracy Difference Between Homomorphic and Non Homomorphic Implementation (%) > 2
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|