concrete/benchmarks/logistic_regression.py
Latest commit b34984d44c by Andrei Stoian (2021-12-15 11:04:30 +01:00): fix: benchmarks for regression (#1188)
Wrong variable names for measures in linear regression and missing one in logistic. Closes #1187.


# bench: Full Target: Logistic Regression
# Disable line length warnings as we have a looooong metric...
# flake8: noqa: E501
# pylint: disable=C0301
from copy import deepcopy
from typing import Any, Dict

import numpy as np
from numpy.random import RandomState
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from concrete.quantization import QuantizedArray, QuantizedLinear, QuantizedModule, QuantizedSigmoid


class QuantizedLogisticRegression(QuantizedModule):
    """
    Quantized Logistic Regression

    Building on top of QuantizedModule, this class will chain together a linear transformation
    and an inverse-link function, in this case the logistic function
    """

    @staticmethod
    def from_sklearn(sklearn_model, calibration_data):
        """Create a Quantized Logistic Regression initialized from a sklearn trained model"""
        if sklearn_model.coef_.ndim == 1:
            weights = np.expand_dims(sklearn_model.coef_, 1)
        else:
            weights = sklearn_model.coef_.transpose()
        bias = sklearn_model.intercept_
        # In our case we have two data dimensions, so the weights precision needs to be 2 bits,
        # as for now we need the quantized values to be greater than zero for weights.
        # Thus, to ensure a maximum of 7 bits in the output of the linear transformation, we
        # choose 4 bits for the data and the minimum of 1 for the bias
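        # Rough bit-budget check (an estimate, not necessarily the library's exact accumulator
        # rule): a 4-bit input times a 2-bit weight fits in about 6 bits, and summing the two
        # feature products plus the 1-bit bias adds roughly one more bit, staying within the
        # 7-bit target above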
        return QuantizedLogisticRegression(4, 2, 1, 6, weights, bias, calibration_data)

    def __init__(self, q_bits, w_bits, b_bits, out_bits, weights, bias, calibration_data) -> None:
        """
        Create the Logistic regression with different quantization bit precisions:

        Quantization Parameters - Number of bits:
            q_bits (int): bits for input data, ensuring that the number of bits of
                the w . x + b operation does not exceed 7 for the calibration data
            w_bits (int): bits for weights: in the case of a univariate regression this
                can be 1
            b_bits (int): bits for bias (this is a single value so a single bit is enough)
            out_bits (int): bits for the result of the linear transformation (w.x + b).
                In the case of Logistic Regression the result of the linear
                transformation is input to a univariate inverse-link function, so
                this value can be 7

        Other parameters:
            weights: a numpy nd-array of weights (Nxd) where d is the data dimensionality
            bias: a numpy scalar
            calibration_data: a numpy nd-array of data (Nxd)
        """
        self.n_bits = out_bits

        # We need to calibrate to a sufficiently low number of bits
        # so that the output of the Linear layer (w . x + b)
        # does not exceed 7 bits
        self.q_calibration_data = QuantizedArray(q_bits, calibration_data)

        # Quantize the weights and create the quantized linear layer
        q_weights = QuantizedArray(w_bits, weights)
        q_bias = QuantizedArray(b_bits, bias)
        q_layer = QuantizedLinear(out_bits, q_weights, q_bias)

        # Store quantized layers
        quant_layers_dict: Dict[str, Any] = {}

        # Calibrate the linear layer and obtain calibration_data for the next layers
        calibration_data = self._calibrate_and_store_layers_activation(
            "linear", q_layer, calibration_data, quant_layers_dict
        )

        # Add the inverse-link for inference.
        # This needs to be quantized since it's computed in FHE,
        # but we can use 7 bits of output since, in this case,
        # the result of the inverse-link is not processed by any further layers.
        # Seven bits is the maximum precision but this could be lowered to improve speed
        # at the possible expense of higher deviance of the regressor
        q_logit = QuantizedSigmoid(n_bits=7)

        # Now calibrate the inverse-link function with the linear layer's output data
        calibration_data = self._calibrate_and_store_layers_activation(
            "invlink", q_logit, calibration_data, quant_layers_dict
        )

        # Finally construct our Module using the quantized layers
        super().__init__(quant_layers_dict)

    def _calibrate_and_store_layers_activation(
        self, name, q_function, calibration_data, quant_layers_dict
    ):
        """
        Calibrate a layer of a quantized module (e.g. linear, inverse-link, activation, etc.)
        by looking at the input data, then compute the output of the quantized version of the
        layer to be used as input to the following layers
        """
        # Calibrate the output of the layer
        q_function.calibrate(calibration_data)

        # Store the learned quantized layer
        quant_layers_dict[name] = q_function

        # Create new calibration data (output of the previous layer)
        q_calibration_data = QuantizedArray(self.n_bits, calibration_data)

        # Dequantize to have the value in the clear, ready for the next calibration
        return q_function(q_calibration_data).dequant()

    def quantize_input(self, x):
        """Quantize an input array by copying the calibrated QuantizedArray and updating its values with x"""
        q_input_arr = deepcopy(self.q_calibration_data)
        q_input_arr.update_values(x)
        return q_input_arr


def main():
    """Main benchmark function: generate some synthetic data for two-class classification,
    split it into train and test sets, train a sklearn classifier, calibrate and quantize it
    on the whole dataset, then compile it to FHE. Test the three versions of the classifier
    on the test set and report accuracy"""
    # Generate some data with a fixed seed
    X, y = make_classification(
        n_features=2,
        n_redundant=0,
        n_informative=2,
        random_state=2,
        n_clusters_per_class=1,
        n_samples=100,
    )

    # Scale the data randomly, fixing seeds for reproducibility
    rng = RandomState(2)
    X += 2 * rng.uniform(size=X.shape)

    # Split it into train/test
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    # Train a logistic regression with sklearn on the training set
    logreg = LogisticRegression()
    logreg.fit(x_train, y_train)

    # Calibrate the model for quantization using both training and test data
    calib_data = X
    q_logreg = QuantizedLogisticRegression.from_sklearn(logreg, calib_data)

    # Now, we can compile our model to FHE, taking as possible input set all of our dataset
    X_q = q_logreg.quantize_input(X)

    # bench: Measure: Compilation Time (ms)
    engine = q_logreg.compile(X_q)
    # bench: Measure: End
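
    # The compiled engine is used below through engine.run(), one encrypted inference per test
    # sample; dequantize_output then maps the engine's quantized output back to a clear score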

    # Start classifier evaluation

    # Test the original classifier
    y_pred_test = np.asarray(logreg.predict(x_test))

    # Now that the model is quantized, predict on the test set
    x_test_q = q_logreg.quantize_input(x_test)
    q_y_score_test = q_logreg.forward_and_dequant(x_test_q)
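    # The dequantized scores are the outputs of the quantized sigmoid, so thresholding them at
    # 0.5 gives the predicted class labels for this two-class problem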
    q_y_pred_test = (q_y_score_test > 0.5).astype(np.int32)

    non_homomorphic_correct = 0
    homomorphic_correct = 0

    # Track the samples that are wrongly classified due to quantization issues
    q_wrong_predictions = np.zeros((0, 2), dtype=X.dtype)

    # Predict the FHE quantized classifier probabilities on the test set.
    # Compute FHE quantized accuracy, clear-quantized accuracy and
    # keep track of samples wrongly classified due to quantization
    for i, x_i in enumerate(tqdm(x_test_q.qvalues)):
        y_i = y_test[i]
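
        # Reshape the quantized sample into a single (1, n_features) row of unsigned 8-bit
        # integers, which is the form passed to engine.run below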
        fhe_in_sample = np.expand_dims(x_i, 1).transpose([1, 0]).astype(np.uint8)

        # bench: Measure: Evaluation Time (ms)
        q_pred_fhe = engine.run(fhe_in_sample)
        # bench: Measure: End

        y_score_fhe = q_logreg.dequantize_output(q_pred_fhe)
        homomorphic_prediction = (y_score_fhe > 0.5).astype(np.int32)
        non_homomorphic_prediction = q_y_pred_test[i]

        if non_homomorphic_prediction == y_i:
            non_homomorphic_correct += 1
        elif y_pred_test[i] == y_i:
            # If this was a correct prediction with the clear-sklearn classifier,
            # record the sample as one misclassified due to quantization
            q_wrong_predictions = np.vstack((q_wrong_predictions, x_test[i, :]))

        if homomorphic_prediction == y_i:
            homomorphic_correct += 1

    # Aggregate accuracies for all the versions of the classifier
    sklearn_acc = np.sum(y_pred_test == y_test) / len(y_test) * 100
    non_homomorphic_accuracy = (non_homomorphic_correct / len(y_test)) * 100
    homomorphic_accuracy = (homomorphic_correct / len(y_test)) * 100
    difference = abs(homomorphic_accuracy - non_homomorphic_accuracy)

    print()
    print(f"Sklearn accuracy: {sklearn_acc:.4f}")
    print(f"Non Homomorphic Accuracy: {non_homomorphic_accuracy:.4f}")
    print(f"Homomorphic Accuracy: {homomorphic_accuracy:.4f}")
    print(f"Difference Percentage: {difference:.2f}%")

    # bench: Measure: Sklearn accuracy = sklearn_acc
    # bench: Measure: Non Homomorphic Accuracy = non_homomorphic_accuracy
    # bench: Measure: Homomorphic Accuracy = homomorphic_accuracy
    # bench: Measure: Accuracy Difference Between Homomorphic and Non Homomorphic Implementation (%) = difference
    # bench: Alert: Accuracy Difference Between Homomorphic and Non Homomorphic Implementation (%) > 2
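
    # The alert above flags runs where the homomorphic and non-homomorphic accuracies differ
    # by more than 2 percentage points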


if __name__ == "__main__":
    main()