"""
This module contains machine learning functionality. It is work in
progress, so you must expect things to change. The only tested
functionality for training is using consecutive layers.
This includes logistic regression. It can be run as
follows::
sgd = ml.SGD([ml.Dense(n_examples, n_features, 1),
ml.Output(n_examples, approx=True)], n_epochs,
report_loss=True)
sgd.layers[0].X.input_from(0)
sgd.layers[1].Y.input_from(1)
sgd.reset()
sgd.run()
This loads measurements from party 0 and labels (0/1) from party
1. After running, the model is stored in :py:obj:`sgd.layers[0].W` and
:py:obj:`sgd.layers[0].b`. The :py:obj:`approx` parameter determines
whether to use an approximate sigmoid function. Setting it to 5 uses
a five-piece approximation instead of a three-piece one.
A simple network for MNIST using two dense layers can be trained as
follows::
sgd = ml.SGD([ml.Dense(60000, 784, 128, activation='relu'),
ml.Dense(60000, 128, 10),
ml.MultiOutput(60000, 10)], n_epochs,
report_loss=True)
sgd.layers[0].X.input_from(0)
sgd.layers[1].Y.input_from(1)
sgd.reset()
sgd.run()
See `this repository <https://github.com/csiro-mlai/mnist-mpc>`_
for scripts importing MNIST training data and further examples.
Inference can be run as follows::
data = sfix.Matrix(n_test, n_features)
data.input_from(0)
res = sgd.eval(data)
print_ln('Results: %s', [x.reveal() for x in res])
For inference/classification, this module offers the layers necessary
for neural networks such as DenseNet, ResNet, and SqueezeNet. A
minimal example using input from player 0 and model from player 1
looks as follows::
graph = Optimizer()
graph.layers = layers
layers[0].X.input_from(0)
for layer in layers:
layer.input_from(1)
graph.forward(1)
res = layers[-1].Y
See the `readme <https://github.com/data61/MP-SPDZ/#tensorflow-inference>`_ for
an example of how to run MP-SPDZ on TensorFlow graphs.
"""
import math
import re
from Compiler import mpc_math, util
from Compiler.types import *
from Compiler.types import _unreduced_squant
from Compiler.library import *
from Compiler.util import is_zero, tree_reduce
from Compiler.comparison import CarryOutRawLE
from Compiler.GC.types import sbitint
from functools import reduce
def log_e(x):
return mpc_math.log_fx(x, math.e)
use_mux = False
def exp(x):
if use_mux:
return mpc_math.mux_exp(math.e, x)
else:
return mpc_math.pow_fx(math.e, x)
def get_limit(x):
exp_limit = 2 ** (x.k - x.f - 1)
return math.log(exp_limit)
def sanitize(x, raw, lower, upper):
limit = get_limit(x)
res = (x > limit).if_else(upper, raw)
return (x < -limit).if_else(lower, res)
def sigmoid(x):
""" Sigmoid function.
:param x: sfix """
return sigmoid_from_e_x(x, exp(-x))
def sigmoid_from_e_x(x, e_x):
return sanitize(x, 1 / (1 + e_x), 0, 1)
def sigmoid_prime(x):
""" Sigmoid derivative.
:param x: sfix """
sx = sigmoid(x)
return sx * (1 - sx)
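# A minimal usage sketch (assuming the usual .mpc program context where
# sfix and print_ln are available); the numbers are illustrative:
#
#   x = sfix(0.5)
#   s = sigmoid(x)           # ~0.6225
#   d = sigmoid_prime(x)     # s * (1 - s), ~0.2350
#   print_ln('%s %s', s.reveal(), d.reveal())
#
# sanitize() clamps the result where exp() would leave the representable
# sfix range, i.e. for |x| beyond log(2 ** (k - f - 1)).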
@vectorize
def approx_sigmoid(x, n=3):
""" Piece-wise approximate sigmoid as in
`Hong et al. <https://arxiv.org/abs/2002.04344>`_
:param x: input
:param n: number of pieces, 3 (default) or 5
"""
if n == 5:
cuts = [-5, -2.5, 2.5, 5]
le = [0] + [x <= cut for cut in cuts] + [1]
select = [le[i + 1] - le[i] for i in range(5)]
outputs = [cfix(10 ** -4),
0.02776 * x + 0.145,
0.17 * x + 0.5,
0.02776 * x + 0.85498,
cfix(1 - 10 ** -4)]
return sum(a * b for a, b in zip(select, outputs))
else:
a = x < -0.5
b = x > 0.5
return a.if_else(0, b.if_else(1, 0.5 + x))
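# Worked example of the pieces (a sketch, values illustrative): with the
# default n=3, inputs below -0.5 give 0, above 0.5 give 1, and the middle
# segment is linear in x:
#
#   approx_sigmoid(sfix(0))      # 0.5
#   approx_sigmoid(sfix(0.25))   # 0.75
#   approx_sigmoid(sfix(3), 5)   # 0.02776 * 3 + 0.85498 ~ 0.938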
def lse_0_from_e_x(x, e_x):
return sanitize(-x, log_e(1 + e_x), x + 2 ** -x.f, 0)
def lse_0(x):
return lse_0_from_e_x(x, exp(x))
def approx_lse_0(x, n=3):
assert n != 5
a = x < -0.5
b = x > 0.5
return a.if_else(0, b.if_else(x, 0.5 * (x + 0.5) ** 2)) - x
def relu_prime(x):
""" ReLU derivative. """
return (0 <= x)
def relu(x):
""" ReLU function (maximum of input and zero). """
return (0 < x).if_else(x, 0)
def argmax(x):
""" Compute index of maximum element.
:param x: iterable
:returns: sint or 0 if :py:obj:`x` has length 1
"""
def op(a, b):
comp = (a[1] > b[1])
return comp.if_else(a[0], b[0]), comp.if_else(a[1], b[1])
return tree_reduce(op, enumerate(x))[0]
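# Example (sketch): the result is a secret index that can be revealed in
# an .mpc program:
#
#   scores = [sfix(0.1), sfix(0.7), sfix(0.2)]
#   print_ln('best: %s', argmax(scores).reveal())   # prints 1
#
# tree_reduce keeps the comparison depth logarithmic in len(x).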
def softmax(x):
""" Softmax.
:param x: vector or list of sfix
:returns: sfix vector
"""
return softmax_from_exp(exp_for_softmax(x)[0])
def exp_for_softmax(x):
m = util.max(x) - get_limit(x[0]) + math.log(len(x))
mv = m.expand_to_vector(len(x))
try:
x = x.get_vector()
except AttributeError:
x = sfix(x)
if use_mux:
return exp(x - mv), m
else:
return (x - mv > -get_limit(x)).if_else(exp(x - mv), 0), m
def softmax_from_exp(x):
return x / sum(x)
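# exp_for_softmax() shifts the inputs so that the largest one sits
# log(len(x)) below the representable exponent limit, which prevents both
# exp() and the subsequent sum from overflowing; softmax_from_exp() then
# normalizes. This is the usual shifted-softmax trick, e.g. (sketch):
#
#   x = sfix.Array(3)
#   x.assign([1, 2, 3])
#   probs = softmax(x.get_vector())   # ~ [0.09, 0.24, 0.67]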
report_progress = False
def progress(x):
if report_progress:
print_ln(x)
time()
def set_n_threads(n_threads):
Layer.n_threads = n_threads
Optimizer.n_threads = n_threads
def _no_mem_warnings(function):
def wrapper(*args, **kwargs):
get_program().warn_about_mem.append(False)
res = function(*args, **kwargs)
get_program().warn_about_mem.pop()
return res
copy_doc(wrapper, function)
return wrapper
class Tensor(MultiArray):
def __init__(self, *args, **kwargs):
kwargs['alloc'] = False
super(Tensor, self).__init__(*args, **kwargs)
def input_from(self, *args, **kwargs):
self.alloc()
super(Tensor, self).input_from(*args, **kwargs)
def __getitem__(self, *args):
self.alloc()
return super(Tensor, self).__getitem__(*args)
def assign_vector(self, *args):
self.alloc()
return super(Tensor, self).assign_vector(*args)
def assign_vector_by_indices(self, *args):
self.alloc()
return super(Tensor, self).assign_vector_by_indices(*args)
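# Tensor defers allocation until first use (input, indexing, or vector
# assignment), so declaring layers does not cost memory for parts of a
# network that are never exercised. Sketch:
#
#   t = Tensor([100, 100], sfix)   # no memory allocated yet
#   t.input_from(0)                # alloc() happens here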
class Layer:
n_threads = 1
inputs = []
input_bias = True
thetas = lambda self: ()
debug_output = False
back_batch_size = 128
print_random_update = False
@property
def shape(self):
return list(self._Y.sizes)
@property
def X(self):
self._X.alloc()
return self._X
@X.setter
def X(self, value):
self._X = value
@property
def Y(self):
self._Y.alloc()
return self._Y
@Y.setter
def Y(self, value):
self._Y = value
def forward(self, batch=None, training=None):
if batch is None:
batch = Array.create_from(regint(0))
self._forward(batch)
def __str__(self):
return type(self).__name__ + str(self._Y.sizes)
def __repr__(self):
return '%s(%s)' % (type(self).__name__, self.Y.sizes)
class NoVariableLayer(Layer):
input_from = lambda *args, **kwargs: None
output_weights = lambda *args: None
nablas = lambda self: ()
reset = lambda self: None
class Output(NoVariableLayer):
""" Fixed-point logistic regression output layer.
:param N: number of examples
:param approx: :py:obj:`False` (default) or parameter for :py:obj:`approx_sigmoid`
"""
n_outputs = 2
@classmethod
def from_args(cls, N, program):
res = cls(N, approx='approx' in program.args)
res.compute_loss = 'no_loss' not in program.args
return res
def __init__(self, N, debug=False, approx=False):
self.N = N
self.X = sfix.Array(N)
self.Y = sfix.Array(N)
self.nabla_X = sfix.Array(N)
self.l = MemValue(sfix(-1))
self.e_x = sfix.Array(N)
self.debug = debug
self.weights = None
self.approx = approx
self.compute_loss = True
self.d_out = 1
def divisor(self, divisor, size):
return cfix(1.0 / divisor, size=size)
def _forward(self, batch):
if self.approx == 5:
self.l.write(999)
return
N = len(batch)
lse = sfix.Array(N)
@multithread(self.n_threads, N)
def _(base, size):
x = self.X.get_vector(base, size)
y = self.Y.get(batch.get_vector(base, size))
if self.approx:
if self.compute_loss:
lse.assign(approx_lse_0(x, self.approx) + x * (1 - y), base)
return
e_x = exp(-x)
self.e_x.assign(e_x, base)
if self.compute_loss:
lse.assign(lse_0_from_e_x(-x, e_x) + x * (1 - y), base)
self.l.write(sum(lse) * \
self.divisor(N, 1))
def eval(self, size, base=0, top=False):
assert not top
if self.approx:
return approx_sigmoid(self.X.get_vector(base, size), self.approx)
else:
return sigmoid_from_e_x(self.X.get_vector(base, size),
self.e_x.get_vector(base, size))
def backward(self, batch):
N = len(batch)
@multithread(self.n_threads, N)
def _(base, size):
diff = self.eval(size, base) - \
self.Y.get(batch.get_vector(base, size))
if self.weights is not None:
assert N == len(self.weights)
diff *= self.weights.get_vector(base, size)
assert self.weight_total == N
self.nabla_X.assign(diff, base)
# @for_range_opt(len(diff))
# def _(i):
# self.nabla_X[i] = self.nabla_X[i] * self.weights[i]
if self.debug_output:
print_ln('sigmoid X %s', self.X.reveal_nested())
print_ln('sigmoid nabla %s', self.nabla_X.reveal_nested())
print_ln('batch %s', batch.reveal_nested())
def set_weights(self, weights):
assert sfix.f == cfix.f
self.weights = cfix.Array(len(weights))
self.weights.assign(weights)
self.weight_total = sum(weights)
def average_loss(self, N):
return self.l.reveal()
def reveal_correctness(self, n=None, Y=None, debug=False):
if n is None:
n = self.X.sizes[0]
if Y is None:
Y = self.Y
n_correct = MemValue(0)
n_printed = MemValue(0)
@for_range_opt(n)
def _(i):
truth = Y[i].reveal()
b = self.X[i].reveal()
if debug:
nabla = self.nabla_X[i].reveal()
guess = b > 0
correct = truth == guess
n_correct.iadd(correct)
if debug:
to_print = (1 - correct) * (n_printed < 10)
n_printed.iadd(to_print)
print_ln_if(to_print, '%s: %s %s %s %s',
i, truth, guess, b, nabla)
return n_correct
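# Sketch of checking training accuracy with this layer, assuming an SGD
# instance as in the module docstring:
#
#   sgd.forward(n_examples)
#   n_correct = sgd.layers[-1].reveal_correctness()
#   print_ln('%s correct out of %s', n_correct, n_examples)
#
# reveal_correctness() scores the sign of the pre-sigmoid value X against
# the stored 0/1 labels in Y.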
class MultiOutputBase(NoVariableLayer):
def __init__(self, N, d_out, approx=False, debug=False):
self.X = sfix.Matrix(N, d_out)
self.Y = sint.Matrix(N, d_out)
self.nabla_X = sfix.Matrix(N, d_out)
self.l = MemValue(sfix(-1))
self.losses = sfix.Array(N)
self.approx = None
self.N = N
self.d_out = d_out
self.compute_loss = True
def eval(self, N):
d_out = self.X.sizes[1]
res = sfix.Matrix(N, d_out)
res.assign_vector(self.X.get_part_vector(0, N))
return res
def average_loss(self, N):
return sum(self.losses.get_vector(0, N)).reveal() / N
def reveal_correctness(self, n=None, Y=None, debug=False):
if n is None:
n = self.X.sizes[0]
if Y is None:
Y = self.Y
n_printed = MemValue(0)
assert n <= len(self.X)
assert n <= len(Y)
Y.address = MemValue.if_necessary(Y.address)
@map_sum(None if debug else self.n_threads, None, n, 1, regint)
def _(i):
a = Y[i].reveal_list()
b = self.X[i].reveal_list()
if debug:
loss = self.losses[i].reveal()
exp = self.get_extra_debugging(i)
nabla = self.nabla_X[i].reveal_list()
truth = argmax(a)
guess = argmax(b)
correct = truth == guess
if debug:
to_print = (1 - correct) * (n_printed < 10)
n_printed.iadd(to_print)
print_ln_if(to_print, '%s: %s %s %s %s %s %s',
i, truth, guess, loss, b, exp, nabla)
return correct
return _()
@property
def n_outputs(self):
return self.d_out
def get_extra_debugging(self, i):
return ''
@staticmethod
def from_args(program, N, n_output):
if 'relu_out' in program.args:
res = ReluMultiOutput(N, n_output)
else:
res = MultiOutput(N, n_output, approx='approx' in program.args)
res.cheaper_loss = 'mse' in program.args
res.compute_loss = 'no_loss' not in program.args
for arg in program.args:
m = re.match('approx=(.*)', arg)
if m:
res.approx = float(m.group(1))
return res
class MultiOutput(MultiOutputBase):
"""
Output layer for multi-class classification with softmax and cross entropy.
:param N: number of examples
:param d_out: number of classes
:param approx: use ReLU division instead of softmax for the loss
"""
def __init__(self, N, d_out, approx=False, debug=False):
MultiOutputBase.__init__(self, N, d_out)
self.exp = sfix.Matrix(N, d_out)
self.approx = approx
self.positives = sint.Matrix(N, d_out)
self.relus = sfix.Matrix(N, d_out)
self.cheaper_loss = False
self.debug = debug
self.true_X = sfix.Array(N)
def __repr__(self):
return '%s(%s, %s, approx=%s)' % \
(type(self).__name__, self.N, self.d_out, self.approx)
def _forward(self, batch):
N = len(batch)
d_out = self.X.sizes[1]
tmp = self.losses
@for_range_opt_multithread(self.n_threads, N)
def _(i):
if self.approx:
if self.cheaper_loss or isinstance(self.approx, float):
limit = 0
else:
limit = 0.1
positives = self.X[i].get_vector() > limit
relus = positives.if_else(self.X[i].get_vector(), 0)
self.positives[i].assign_vector(positives)
self.relus[i].assign_vector(relus)
if self.compute_loss:
if self.cheaper_loss:
s = sum(relus)
tmp[i] = sum((self.Y[batch[i]][j] * s - relus[j]) ** 2
for j in range(d_out)) / s ** 2 * 0.5
else:
div = relus / sum(relus).expand_to_vector(d_out)
self.losses[i] = -sfix.dot_product(
self.Y[batch[i]].get_vector(), log_e(div))
else:
e, m = exp_for_softmax(self.X[i])
self.exp[i].assign_vector(e)
if self.compute_loss:
true_X = sfix.dot_product(self.Y[batch[i]], self.X[i])
tmp[i] = m + log_e(sum(e)) - true_X
self.true_X[i] = true_X
self.l.write(sum(tmp.get_vector(0, N)) / N)
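# The softmax branch above computes the cross-entropy without forming
# softmax probabilities: with m the shift from exp_for_softmax(),
#   -log softmax(x)_y = log(sum_j e^(x_j)) - x_y
#                     = m + log(sum_j e^(x_j - m)) - x_y,
# which is exactly tmp[i] = m + log_e(sum(e)) - true_X above.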
def eval(self, N, top=False):
d_out = self.X.sizes[1]
if top:
res = sint.Array(N)
@for_range_opt_multithread(self.n_threads, N)
def _(i):
res[i] = argmax(self.X[i])
return res
res = sfix.Matrix(N, d_out)
if self.approx:
@for_range_opt_multithread(self.n_threads, N)
def _(i):
relus = (self.X[i].get_vector() > 0).if_else(
self.X[i].get_vector(), 0)
res[i].assign_vector(relus / sum(relus).expand_to_vector(d_out))
return res
@for_range_opt_multithread(self.n_threads, N)
def _(i):
x = self.X[i].get_vector() - \
util.max(self.X[i].get_vector()).expand_to_vector(d_out)
e = exp(x)
res[i].assign_vector(e / sum(e).expand_to_vector(d_out))
return res
def backward(self, batch):
d_out = self.X.sizes[1]
if self.approx:
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
if self.cheaper_loss:
s = sum(self.relus[i])
ss = s * s * s
inv = 1 / ss
@for_range_opt(d_out)
def _(j):
res = 0
for k in range(d_out):
relu = self.relus[i][k]
summand = relu - self.Y[batch[i]][k] * s
summand *= (sfix.from_sint(j == k) - relu)
res += summand
fallback = -self.Y[batch[i]][j]
res *= inv
self.nabla_X[i][j] = self.positives[i][j].if_else(res, fallback)
return
relus = self.relus[i].get_vector()
if isinstance(self.approx, float):
relus += self.approx
positives = self.positives[i].get_vector()
inv = (1 / sum(relus)).expand_to_vector(d_out)
truths = self.Y[batch[i]].get_vector()
raw = truths / relus - inv
self.nabla_X[i] = -positives.if_else(raw, truths)
self.maybe_debug_backward(batch)
return
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
div = softmax_from_exp(self.exp[i])
self.nabla_X[i][:] = -self.Y[batch[i]][:] + div
self.maybe_debug_backward(batch)
def maybe_debug_backward(self, batch):
if self.debug:
@for_range(len(batch))
def _(i):
check = 0
for j in range(self.X.sizes[1]):
to_check = self.nabla_X[i][j].reveal()
check += (to_check > len(batch)) + (to_check < -len(batch))
print_ln_if(check, 'X %s', self.X[i].reveal_nested())
print_ln_if(check, 'exp %s', self.exp[i].reveal_nested())
print_ln_if(check, 'nabla X %s',
self.nabla_X[i].reveal_nested())
def get_extra_debugging(self, i):
if self.approx:
return self.relus[i].reveal_list()
else:
return self.exp[i].reveal_list()
class ReluMultiOutput(MultiOutputBase):
"""
Output layer for multi-class classification with back-propagation
based on ReLU division.
:param N: number of examples
:param d_out: number of classes
"""
def forward(self, batch, training=None):
self.l.write(999)
def backward(self, batch):
N = len(batch)
d_out = self.X.sizes[1]
relus = sfix.Matrix(N, d_out)
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
positives = self.X[i].get_vector() > 0
relus = positives.if_else(self.X[i].get_vector(), 0)
s = sum(relus)
inv = 1 / s
prod = relus * inv
res = prod - self.Y[batch[i]].get_vector()
self.nabla_X[i].assign_vector(res)
class DenseBase(Layer):
thetas = lambda self: (self.W, self.b)
nablas = lambda self: (self.nabla_W, self.nabla_b)
def output_weights(self):
self.W.print_reveal_nested()
print_ln('%s', self.b.reveal_nested())
def backward_params(self, f_schur_Y, batch):
N = len(batch)
tmp = Matrix(self.d_in, self.d_out, unreduced_sfix)
A = sfix.Matrix(N, self.d_out, address=f_schur_Y.address)
B = sfix.Matrix(self.N, self.d_in, address=self.X.address)
@multithread(self.n_threads, self.d_in)
def _(base, size):
mp = B.direct_trans_mul(A, reduce=False,
indices=(regint.inc(size, base),
batch.get_vector(),
regint.inc(N),
regint.inc(self.d_out)))
tmp.assign_part_vector(mp, base)
progress('nabla W (matmul)')
@multithread(self.n_threads, self.d_in * self.d_out,
max_size=get_program().budget)
def _(base, size):
self.nabla_W.assign_vector(
tmp.get_vector(base, size).reduce_after_mul(), base=base)
if self.print_random_update:
print_ln('backward %s', self)
i = regint.get_random(64) % self.d_in
j = regint.get_random(64) % self.d_out
print_ln('%s at (%s, %s): before=%s after=%s A=%s B=%s',
str(self.nabla_W), i, j, tmp[i][j].v.reveal(),
self.nabla_W[i][j].reveal(),
A.get_column(j).reveal(),
B.get_column_by_row_indices(
batch.get_vector(), i).reveal())
print_ln('batch=%s B=%s', batch,
[self.X[bi][0][i].reveal() for bi in batch])
progress('nabla W')
self.nabla_b.assign_vector(sum(sum(f_schur_Y[k][j].get_vector()
for k in range(N))
for j in range(self.d)))
progress('nabla b')
if self.debug_output:
print_ln('dense nabla Y %s', self.nabla_Y.reveal_nested())
print_ln('dense W %s', self.W.reveal_nested())
print_ln('dense nabla X %s', self.nabla_X.reveal_nested())
if self.debug:
limit = N * self.debug
@for_range_opt(self.d_in)
def _(i):
@for_range_opt(self.d_out)
def _(j):
to_check = self.nabla_W[i][j].reveal()
check = sum(to_check > limit) + sum(to_check < -limit)
@if_(check)
def _():
print_ln('nabla W %s %s %s: %s', i, j, self.W.sizes, to_check)
print_ln('Y %s', [f_schur_Y[k][0][j].reveal()
for k in range(N)])
print_ln('X %s', [self.X[k][0][i].reveal()
for k in range(N)])
@for_range_opt(self.d_out)
def _(j):
to_check = self.nabla_b[j].reveal()
check = sum(to_check > limit) + sum(to_check < -limit)
@if_(check)
def _():
print_ln('nabla b %s %s: %s', j, len(self.b), to_check)
print_ln('Y %s', [f_schur_Y[k][0][j].reveal()
for k in range(N)])
@for_range_opt(len(batch))
def _(i):
to_check = self.nabla_X[i].get_vector().reveal()
check = sum(to_check > limit) + sum(to_check < -limit)
@if_(check)
def _():
print_ln('X %s %s', i, self.X[i].reveal_nested())
print_ln('Y %s %s', i, f_schur_Y[i].reveal_nested())
class Dense(DenseBase):
""" Fixed-point dense (matrix multiplication) layer.
:param N: number of examples
:param d_in: input dimension
:param d_out: output dimension
"""
def __init__(self, N, d_in, d_out, d=1, activation='id', debug=False):
if activation == 'id':
self.activation_layer = None
elif activation == 'relu':
self.activation_layer = Relu([N, d, d_out])
elif activation == 'square':
self.activation_layer = Square([N, d, d_out])
else:
raise CompilerError('activation not supported: %s' % activation)
self.N = N
self.d_in = d_in
self.d_out = d_out
self.d = d
self.activation = activation
self.X = MultiArray([N, d, d_in], sfix)
self.Y = MultiArray([N, d, d_out], sfix)
self.W = Tensor([d_in, d_out], sfix)
self.b = sfix.Array(d_out)
back_N = min(N, self.back_batch_size)
self.nabla_Y = MultiArray([back_N, d, d_out], sfix)
self.nabla_X = MultiArray([back_N, d, d_in], sfix)
self.nabla_W = sfix.Matrix(d_in, d_out)
self.nabla_b = sfix.Array(d_out)
self.debug = debug
l = self.activation_layer
if l:
self.f_input = l.X
l.Y = self.Y
l.nabla_Y = self.nabla_Y
else:
self.f_input = self.Y
def __repr__(self):
return '%s(%s, %s, %s, activation=%s)' % \
(type(self).__name__, self.N, self.d_in,
self.d_out, repr(self.activation))
def reset(self):
d_in = self.d_in
d_out = self.d_out
r = math.sqrt(6.0 / (d_in + d_out))
print('Initializing dense weights in [%f,%f]' % (-r, r))
self.W.randomize(-r, r)
self.b.assign_all(0)
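# reset() uses the uniform Glorot/Xavier range r = sqrt(6 / (d_in + d_out)).
# For example (illustrative numbers), Dense(60000, 784, 128) gives
# r = sqrt(6 / 912) ~ 0.081, so weights start in [-0.081, 0.081] and
# biases at zero.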
def input_from(self, player, raw=False):
self.W.input_from(player, raw=raw)
if self.input_bias:
self.b.input_from(player, raw=raw)
def compute_f_input(self, batch):
N = len(batch)
assert self.d == 1
if self.input_bias:
prod = MultiArray([N, self.d, self.d_out], sfix)
else:
prod = self.f_input
max_size = program.Program.prog.budget // self.d_out
@multithread(self.n_threads, N, max_size)
def _(base, size):
X_sub = sfix.Matrix(self.N, self.d_in, address=self.X.address)
prod.assign_part_vector(
X_sub.direct_mul(self.W, indices=(
batch.get_vector(base, size), regint.inc(self.d_in),
regint.inc(self.d_in), regint.inc(self.d_out))), base)
if self.input_bias:
if self.d_out == 1:
@multithread(self.n_threads, N)
def _(base, size):
v = prod.get_vector(base, size) + self.b.expand_to_vector(0, size)
self.f_input.assign_vector(v, base)
else:
@for_range_multithread(self.n_threads, 100, N)
def _(i):
v = prod[i].get_vector() + self.b.get_vector()
self.f_input[i].assign_vector(v)
progress('f input')
def _forward(self, batch=None):
if batch is None:
batch = regint.Array(self.N)
batch.assign(regint.inc(self.N))
self.compute_f_input(batch=batch)
if self.activation_layer:
self.activation_layer.forward(batch)
if self.debug_output:
print_ln('dense X %s', self.X.reveal_nested())
print_ln('dense W %s', self.W.reveal_nested())
print_ln('dense b %s', self.b.reveal_nested())
print_ln('dense Y %s', self.Y.reveal_nested())
if self.debug:
limit = self.debug
@for_range_opt(len(batch))
def _(i):
@for_range_opt(self.d_out)
def _(j):
to_check = self.Y[i][0][j].reveal()
check = to_check > limit
@if_(check)
def _():
print_ln('dense Y %s %s %s %s', i, j, self.W.sizes, to_check)
print_ln('X %s', self.X[i].reveal_nested())
print_ln('W %s',
[self.W[k][j].reveal() for k in range(self.d_in)])
def backward(self, compute_nabla_X=True, batch=None):
N = len(batch)
d = self.d
d_out = self.d_out
X = self.X
Y = self.Y
W = self.W
b = self.b
nabla_X = self.nabla_X
nabla_Y = self.nabla_Y
nabla_W = self.nabla_W
nabla_b = self.nabla_b
if self.activation_layer:
self.activation_layer.backward(batch)
f_schur_Y = self.activation_layer.nabla_X
else:
f_schur_Y = nabla_Y
if compute_nabla_X:
@multithread(self.n_threads, N)
def _(base, size):
B = sfix.Matrix(N, d_out, address=f_schur_Y.address)
nabla_X.assign_part_vector(
B.direct_mul_trans(W, indices=(regint.inc(size, base),
regint.inc(self.d_out),
regint.inc(self.d_out),
regint.inc(self.d_in))),
base)
if self.print_random_update:
print_ln('backward %s', self)
index = regint.get_random(64) % self.nabla_X.total_size()
print_ln('%s nabla_X at %s: %s', str(self.nabla_X),
index, self.nabla_X.to_array()[index].reveal())
progress('nabla X')
self.backward_params(f_schur_Y, batch=batch)
class QuantizedDense(DenseBase):
def __init__(self, N, d_in, d_out):
self.N = N
self.d_in = d_in
self.d_out = d_out
self.d = 1
self.H = math.sqrt(1.5 / (d_in + d_out))
self.W = sfix.Matrix(d_in, d_out)
self.nabla_W = self.W.same_shape()
self.T = sint.Matrix(d_in, d_out)
self.b = sfix.Array(d_out)
self.nabla_b = self.b.same_shape()
self.X = MultiArray([N, 1, d_in], sfix)
self.Y = MultiArray([N, 1, d_out], sfix)
self.nabla_Y = self.Y.same_shape()
def reset(self):
@for_range(self.d_in)
def _(i):
@for_range(self.d_out)
def _(j):
self.W[i][j] = sfix.get_random(-1, 1)
self.b.assign_all(0)
def _forward(self):
@for_range_opt(self.d_in)
def _(i):
@for_range_opt(self.d_out)
def _(j):
over = self.W[i][j] > 0.5
under = self.W[i][j] < -0.5
self.T[i][j] = over.if_else(1, under.if_else(-1, 0))
over = self.W[i][j] > 1
under = self.W[i][j] < -1
self.W[i][j] = over.if_else(1, under.if_else(-1, self.W[i][j]))
@for_range_opt(self.N)
def _(i):
assert self.d_out == 1
self.Y[i][0][0] = self.b[0] + self.H * sfix._new(
sint.dot_product([self.T[j][0] for j in range(self.d_in)],
[self.X[i][0][j].v for j in range(self.d_in)]))
def backward(self, compute_nabla_X=False):
assert not compute_nabla_X
self.backward_params(self.nabla_Y)
class Dropout(NoVariableLayer):
""" Dropout layer.
:param N: number of examples
:param d1: total dimension
:param alpha: probability (power of two)
"""
def __init__(self, N, d1, d2=1, alpha=0.5):
self.N = N
self.d1 = d1
self.d2 = d2
self.X = MultiArray([N, d1, d2], sfix)
self.Y = MultiArray([N, d1, d2], sfix)
self.nabla_Y = MultiArray([N, d1, d2], sfix)
self.nabla_X = MultiArray([N, d1, d2], sfix)
self.alpha = alpha
self.B = MultiArray([N, d1, d2], sint)
def __repr__(self):
return '%s(%s, %s, alpha=%s)' % \
(type(self).__name__, self.N, self.d1, self.alpha)
def forward(self, batch, training=False):
if training:
n_bits = -math.log(self.alpha, 2)
assert n_bits == int(n_bits)
n_bits = int(n_bits)
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
size = self.d1 * self.d2
self.B[i].assign_vector(util.tree_reduce(
util.or_op, (sint.get_random_bit(size=size)
for i in range(n_bits))))
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
self.Y[i].assign_vector(1 / (1 - self.alpha) *
self.X[batch[i]].get_vector() * self.B[i].get_vector())
else:
@for_range(len(batch))
def _(i):
self.Y[i] = self.X[batch[i]]
if self.debug_output:
print_ln('dropout X %s', self.X.reveal_nested())
print_ln('dropout Y %s', self.Y.reveal_nested())
def backward(self, compute_nabla_X=True, batch=None):
if compute_nabla_X:
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
self.nabla_X[batch[i]].assign_vector(
self.nabla_Y[i].get_vector() * self.B[i].get_vector())
if self.debug_output:
print_ln('dropout nabla_Y %s', self.nabla_Y.reveal_nested())
print_ln('dropout nabla_X %s', self.nabla_X.reveal_nested())
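# The mask B is sampled as the OR of n_bits random bits, which is 1 with
# probability 1 - 2 ** -n_bits; hence alpha (the drop probability) must be
# a power of two. Sketch with alpha=0.25:
#
#   layer = Dropout(N, d1, alpha=0.25)   # n_bits = 2, keep probability 0.75
#
# Kept activations are scaled by 1 / (1 - alpha) (inverted dropout), so no
# rescaling is needed at inference time.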
class ElementWiseLayer(NoVariableLayer):
def __init__(self, shape, inputs=None):
self.X = Tensor(shape, sfix)
self.Y = Tensor(shape, sfix)
backward_shape = list(shape)
backward_shape[0] = min(shape[0], self.back_batch_size)
self.nabla_X = Tensor(backward_shape, sfix)
self.nabla_Y = Tensor(backward_shape, sfix)
self.inputs = inputs
def f_part(self, base, size):
return self.f(self.X.get_part_vector(base, size))
def f_prime_part(self, base, size):
return self.f_prime(self.Y.get_part_vector(base, size))
def _forward(self, batch=[0]):
n_per_item = reduce(operator.mul, self.X.sizes[1:])
@multithread(self.n_threads, len(batch), max(1, 1000 // n_per_item))
def _(base, size):
self.Y.assign_part_vector(self.f_part(base, size), base)
if self.debug_output:
name = self
@for_range(len(batch))
def _(i):
print_ln('%s X %s %s', name, i, self.X[i].reveal_nested())
print_ln('%s Y %s %s', name, i, self.Y[i].reveal_nested())
def backward(self, batch):
f_prime_bit = MultiArray(self.X.sizes, self.prime_type)
n_elements = len(batch) * reduce(operator.mul, f_prime_bit.sizes[1:])
@multithread(self.n_threads, n_elements)
def _(base, size):
f_prime_bit.assign_vector(self.f_prime_part(base, size), base)
progress('f prime')
@multithread(self.n_threads, n_elements)
def _(base, size):
self.nabla_X.assign_vector(self.nabla_Y.get_vector(base, size) *
f_prime_bit.get_vector(base, size),
base)
progress('f prime schur Y')
if self.debug_output:
name = self
@for_range(len(batch))
def _(i):
print_ln('%s X %s %s', name, i, self.X[i].reveal_nested())
print_ln('%s f_prime %s %s', name, i, f_prime_bit[i].reveal_nested())
print_ln('%s nabla Y %s %s', name, i, self.nabla_Y[i].reveal_nested())
print_ln('%s nabla X %s %s', name, i, self.nabla_X[i].reveal_nested())
class Relu(ElementWiseLayer):
""" Fixed-point ReLU layer.
:param shape: input/output shape (tuple/list of int)
"""
f = staticmethod(relu)
f_prime = staticmethod(relu_prime)
prime_type = sint
comparisons = None
def __init__(self, shape, inputs=None):
super(Relu, self).__init__(shape)
self.comparisons = MultiArray(shape, sint)
def f_part(self, base, size):
x = self.X.get_part_vector(base, size)
c = x > 0
self.comparisons.assign_part_vector(c, base)
return c.if_else(x, 0)
def f_prime_part(self, base, size):
return self.comparisons.get_vector(base, size)
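# Relu caches the comparison bits from the forward pass so the backward
# pass can reuse them instead of repeating the expensive secure comparison:
# f_prime_part() simply reads self.comparisons. Sketch:
#
#   relu = Relu([batch_size, 1, d])
#   relu.forward(batch)    # computes X > 0 once and stores the bits
#   relu.backward(batch)   # multiplies nabla_Y by the stored bits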
class Square(ElementWiseLayer):
""" Fixed-point square layer.
:param shape: input/output shape (tuple/list of int)
"""
f = staticmethod(lambda x: x ** 2)
f_prime = staticmethod(lambda x: cfix(2, size=x.size) * x)
prime_type = sfix
class MaxPool(NoVariableLayer):
""" Fixed-point MaxPool layer.
:param shape: input shape (tuple/list of four int)
:param strides: strides (tuple/list of four int, first and last must be 1)
:param ksize: kernel size (tuple/list of four int, first and last must be 1)
:param padding: :py:obj:`'VALID'` (default) or :py:obj:`'SAME'`
"""
def __init__(self, shape, strides=(1, 2, 2, 1), ksize=(1, 2, 2, 1),
padding='VALID'):
assert len(shape) == 4
assert min(shape) > 0, shape
for x in strides, ksize:
for i in 0, 3:
assert x[i] == 1
self.X = Tensor(shape, sfix)
if padding == 'SAME':
output_shape = [int(math.ceil(shape[i] / strides[i])) for i in range(4)]
else:
output_shape = [(shape[i] - ksize[i]) // strides[i] + 1 for i in range(4)]
self.Y = Tensor(output_shape, sfix)
self.strides = strides
self.ksize = ksize
self.padding = padding
self.nabla_X = Tensor(shape, sfix)
self.nabla_Y = Tensor(output_shape, sfix)
self.N = shape[0]
self.comparisons = MultiArray([self.N, self.X.sizes[3],
ksize[1] * ksize[2]], sint)
def __repr__(self):
return '%s(%s, strides=%s, ksize=%s, padding=%s)' % \
(type(self).__name__, self.X.sizes, self.strides,
self.ksize, self.padding)
def forward(self, batch=None, training=False):
if batch is None:
batch = Array.create_from(regint(0))
def process(pool, bi, k, i, j):
def m(a, b):
c = a[0] > b[0]
l = [c * x for x in a[1]]
l += [(1 - c) * x for x in b[1]]
return c.if_else(a[0], b[0]), l
red = util.tree_reduce(m, [(x[0], [1] if training else [])
for x in pool])
self.Y[bi][i][j][k] = red[0]
for i, x in enumerate(red[1]):
self.comparisons[bi][k][i] = x
self.traverse(batch, process)
def backward(self, compute_nabla_X=True, batch=None):
if compute_nabla_X:
self.nabla_X.alloc()
def process(pool, bi, k, i, j):
for (x, h_in, w_in, h, w), c in zip(pool,
self.comparisons[bi][k]):
hh = h * h_in
ww = w * w_in
self.nabla_X[bi][hh][ww][k] = \
util.if_else(h_in * w_in, c * self.nabla_Y[bi][i][j][k],
self.nabla_X[bi][hh][ww][k])
self.traverse(batch, process)
def traverse(self, batch, process):
need_padding = [self.strides[i] * (self.Y.sizes[i] - 1) + self.ksize[i] >
self.X.sizes[i] for i in range(4)]
@for_range_opt_multithread(self.n_threads,
[len(batch), self.X.sizes[3]])
def _(l, k):
bi = batch[l]
@for_range_opt(self.Y.sizes[1])
def _(i):
h_base = self.strides[1] * i
@for_range_opt(self.Y.sizes[2])
def _(j):
w_base = self.strides[2] * j
pool = []
for ii in range(self.ksize[1]):
h = h_base + ii
if need_padding[1]:
h_in = h < self.X.sizes[1]
else:
h_in = True
for jj in range(self.ksize[2]):
w = w_base + jj
if need_padding[2]:
w_in = w < self.X.sizes[2]
else:
w_in = True
if not is_zero(h_in * w_in):
pool.append([h_in * w_in * self.X[bi][h_in * h]
[w_in * w][k], h_in, w_in, h, w])
process(pool, bi, k, i, j)
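# Output-shape sketch: with VALID padding each spatial dimension shrinks as
# (size - ksize) // stride + 1, so
#
#   MaxPool([N, 28, 28, 20])   # default 2x2 kernel, stride 2
#
# produces output shape [N, 14, 14, 20]. In training mode, forward() also
# stores per-window comparison bits so backward() can route each gradient
# to the window's maximum.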
class Argmax(NoVariableLayer):
""" Fixed-point Argmax layer.
:param shape: input shape (tuple/list of two int)
"""
def __init__(self, shape):
assert len(shape) == 2
self.X = MultiArray(shape, sfix)
self.Y = Array(shape[0], sint)
def _forward(self, batch=[0]):
assert len(batch) == 1
self.Y[batch[0]] = argmax(self.X[batch[0]])
class Concat(NoVariableLayer):
""" Fixed-point concatentation layer.
:param inputs: two input layers (tuple/list)
:param dimension: dimension for concatenation (must be 3)
"""
def __init__(self, inputs, dimension):
self.inputs = inputs
self.dimension = dimension
shapes = [inp.shape for inp in inputs]
assert dimension == 3
assert len(shapes) == 2
assert len(shapes[0]) == len(shapes[1])
shape = []
for i in range(len(shapes[0])):
if i == dimension:
shape.append(shapes[0][i] + shapes[1][i])
else:
assert shapes[0][i] == shapes[1][i]
shape.append(shapes[0][i])
self.Y = Tensor(shape, sfix)
def _forward(self, batch=[0]):
assert len(batch) == 1
@for_range_multithread(self.n_threads, 1, self.Y.sizes[1:3])
def _(i, j):
X = [x.Y[batch[0]] for x in self.inputs]
self.Y[batch[0]][i][j].assign_vector(X[0][i][j].get_vector())
self.Y[batch[0]][i][j].assign_part_vector(
X[1][i][j].get_vector(),
len(X[0][i][j]))
class Add(NoVariableLayer):
""" Fixed-point addition layer.
:param inputs: two input layers with same shape (tuple/list)
"""
def __init__(self, inputs):
assert len(inputs) > 1
shape = inputs[0].shape
for inp in inputs:
assert inp.shape == shape
self.Y = Tensor(shape, sfix)
self.inputs = inputs
def _forward(self, batch=[0]):
assert len(batch) == 1
@multithread(self.n_threads, self.Y[0].total_size())
def _(base, size):
tmp = sum(inp.Y[batch[0]].get_vector(base, size)
for inp in self.inputs)
self.Y[batch[0]].assign_vector(tmp, base)
class FusedBatchNorm(Layer):
""" Fixed-point fused batch normalization layer (inference only).
:param shape: input/output shape (tuple/list of four int)
"""
def __init__(self, shape, inputs=None):
assert len(shape) == 4
self.X = Tensor(shape, sfix)
self.Y = Tensor(shape, sfix)
self.weights = sfix.Array(shape[3])
self.bias = sfix.Array(shape[3])
self.inputs = inputs
def input_from(self, player, raw=False):
self.weights.input_from(player, raw=raw)
self.bias.input_from(player, raw=raw)
tmp = sfix.Array(len(self.bias))
tmp.input_from(player, raw=raw)
tmp.input_from(player, raw=raw)
def _forward(self, batch=[0]):
assert len(batch) == 1
@for_range_opt_multithread(self.n_threads, self.X.sizes[1:3])
def _(i, j):
self.Y[batch[0]][i][j].assign_vector(
self.X[batch[0]][i][j].get_vector() * self.weights.get_vector()
+ self.bias.get_vector())
class BatchNorm(Layer):
""" Fixed-point batch normalization layer.
:param shape: input/output shape (tuple/list of four int)
:param approx: use approximate square root
"""
thetas = lambda self: (self.weights, self.bias)
nablas = lambda self: (self.nabla_weights, self.nabla_bias)
def __init__(self, shape, approx=True, args=None):
assert len(shape) in (2, 3, 4)
if len(shape) == 4:
shape = [shape[0], shape[1] * shape[2], shape[3]]
elif len(shape) == 2:
shape = [shape[0], 1, shape[1]]
tensors = (Tensor(shape, sfix) for i in range(4))
self.X, self.Y, self.nabla_X, self.nabla_Y = tensors
arrays = (sfix.Array(shape[2]) for i in range(4))
self.var, self.mu, self.weights, self.bias = arrays
arrays = (sfix.Array(shape[2]) for i in range(4))
self.mu_hat, self.var_hat, self.nabla_weights, self.nabla_bias = arrays
self.epsilon = 2 ** (-sfix.f + 1)
self.momentum = 0.1
if args is not None:
approx = 'precisebn' not in args
self.approx = approx
if approx:
print('Approximate square root inverse in batch normalization')
self.InvertSqrt = mpc_math.InvertSqrt
else:
print('Precise square root inverse in batch normalization')
self.InvertSqrt = lambda x: 1 / mpc_math.sqrt(x)
def __repr__(self):
return '%s(%s, approx=%s)' % \
(type(self).__name__, self.X.sizes, self.approx)
def reset(self):
self.bias.assign_all(0)
self.weights.assign_all(1)
self.mu_hat.assign_all(0)
self.var_hat.assign_all(0)
def _output(self, batch, mu, var):
factor = sfix.Array(len(mu))
factor[:] = self.InvertSqrt(var[:] + self.epsilon)
@for_range_opt_multithread(self.n_threads,
[len(batch), self.X.sizes[1]])
def _(i, j):
tmp = self.weights[:] * (self.X[i][j][:] - self.mu[:]) * factor[:]
self.Y[i][j][:] = self.bias[:] + tmp
def forward(self, batch, training=False):
if training:
d = self.X.sizes[1]
d_in = self.X.sizes[2]
s = sfix.Array(d_in)
@map_sum_simple(self.n_threads, [len(batch), d], sfix, d_in)
def _(i, j):
return (self.X[batch[i]][j].get_vector())
s.assign(_())
@multithread(self.n_threads, d_in)
def _(base, size):
self.mu.assign_vector(
s.get_vector(base, size) / (len(batch) * d), base)
@map_sum_simple(self.n_threads, [len(batch), d], sfix, d_in)
def _(i, j):
item = self.X[batch[i]][j].get_vector()
return ((item - self.mu[:]) ** 2)
self.var.assign(_())
@multithread(self.n_threads, d_in)
def _(base, size):
self.var.assign_vector(
self.var.get_vector(base, size) / (len(batch) * d - 1),
base)
for x, y in (self.mu_hat, self.mu), (self.var_hat, self.var):
x[:] = self.momentum * y[:] + (1 - self.momentum) * x[:]
self._output(batch, self.mu, self.var)
if self.print_random_update:
i = regint.get_random(64) % len(batch)
j = regint.get_random(64) % d
k = regint.get_random(64) % d_in
for x in self.mu, self.var:
print_ln('%s at %s: %s', str(x), k, x[k].reveal())
print_ln('%s at (%s, %s, %s): in=%s out=%s',
str(self.Y), i, j, k, self.X[i][j][k].reveal(),
self.Y[i][j][k].reveal())
else:
self._output(batch, self.mu_hat, self.var_hat)
def backward(self, batch, compute_nabla_X=True):
factor = Array.create_from(
self.InvertSqrt(self.var[:] + self.epsilon))
mynYf = self.X.same_shape()
gamnY = self.X.same_shape()
gamnYd = self.X.same_shape()
nYdf = self.X.same_shape()
d = self.X.sizes[1]
d_in = self.X.sizes[2]
@for_range_opt_multithread(self.n_threads, [len(batch), d])
def _(i, j):
tmp = self.weights[:] * self.nabla_Y[i][j][:]
gamnY[i][j] = tmp
gamnYd[i][j] = tmp * (self.X[i][j][:] - self.mu[:])
mynYf[i][j] = tmp * factor[:]
nYdf[i][j] = self.nabla_Y[i][j][:] * \
(self.X[i][j][:] - self.mu[:]) * factor[:]
@map_sum_simple(self.n_threads, [len(batch), d], sfix, d_in)
def _(i, j):
return (self.nabla_Y[i][j][:])
self.nabla_bias.assign(_())
@map_sum_simple(self.n_threads, [len(batch), d], sfix, d_in)
def _(i, j):
return (nYdf[i][j])
self.nabla_weights.assign(_())
factor3 = Array.create_from(factor[:] ** 3)
@map_sum_simple(self.n_threads, [len(batch), d], sfix, d_in)
def _(i, j):
return (mynYf[i][j])
s1 = Array.create_from(_())
@multithread(self.n_threads, len(s1))
def _(base, size):
s1.assign_vector(s1.get_vector(base, size) / (len(batch) * d), base)
@map_sum_simple(self.n_threads, [len(batch), d], sfix, d_in)
def _(i, j):
return (gamnYd[i][j][:] * factor3[:])
s2 = Array.create_from(_())
@multithread(self.n_threads, len(s2))
def _(base, size):
s2.assign_vector(
s2.get_vector(base, size) / (len(batch) * d - 1), base)
@for_range_opt_multithread(self.n_threads, [len(batch), d])
def _(i, j):
self.nabla_X[i][j][:] = mynYf[i][j][:] \
- s1[:] - (self.X[i][j][:] - self.mu[:]) * s2[:]
if self.print_random_update:
print_ln('backward %s', self)
i = regint.get_random(64) % len(batch)
j = regint.get_random(64) % d
k = regint.get_random(64) % d_in
for x in self.nabla_bias, self.nabla_weights:
print_ln('%s at %s: %s', str(x), k, x[k].reveal())
print_ln('%s at (%s, %s, %s): in=%s out=%s', str(self.Y), i, j, k,
self.nabla_Y[i][j][k].reveal(),
self.nabla_X[i][j][k].reveal())
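# The backward pass above implements the standard batchnorm gradient: with
# factor = 1 / sqrt(var + epsilon),
#   nabla_bias    = sum(nabla_Y)
#   nabla_weights = sum(nabla_Y * (X - mu) * factor)
#   nabla_X       = weights * nabla_Y * factor - s1 - (X - mu) * s2,
# where s1 averages weights * nabla_Y * factor and s2 averages
# weights * nabla_Y * (X - mu) * factor ** 3 over the batch.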
class QuantBase(object):
bias_before_reduction = True
@staticmethod
def new_squant():
class _(squant):
@classmethod
def get_params_from(cls, player):
cls.set_params(sfloat.get_input_from(player),
sint.get_input_from(player))
@classmethod
def get_input_from(cls, player, size=None):
return cls._new(sint.get_input_from(player, size=size))
return _
def const_div(self, acc, n):
logn = int(math.log(n, 2))
acc = (acc + n // 2)
if 2 ** logn == n:
acc = acc.round(self.output_squant.params.k + logn, logn, nearest=True)
else:
acc = acc.int_div(sint(n), self.output_squant.params.k + logn)
return acc
class FixBase:
bias_before_reduction = False
@staticmethod
def new_squant():
class _(sfix):
params = None
return _
def input_params_from(self, player):
pass
def const_div(self, acc, n):
return (sfix._new(acc) * self.output_squant(1 / n)).v
class BaseLayer(Layer):
def __init__(self, input_shape, output_shape, inputs=None):
self.input_shape = input_shape
self.output_shape = output_shape
self.input_squant = self.new_squant()
self.output_squant = self.new_squant()
self.X = Tensor(input_shape, self.input_squant)
self.Y = Tensor(output_shape, self.output_squant)
back_shapes = list(input_shape), list(output_shape)
for x in back_shapes:
x[0] = min(x[0], self.back_batch_size)
self.nabla_X = MultiArray(back_shapes[0], self.input_squant)
self.nabla_Y = MultiArray(back_shapes[1], self.output_squant)
self.inputs = inputs
def temp_shape(self):
return [0]
@property
def N(self):
return self.input_shape[0]
class ConvBase(BaseLayer):
fewer_rounds = True
use_conv2ds = True
temp_weights = None
temp_inputs = None
thetas = lambda self: (self.weights, self.bias)
nablas = lambda self: (self.nabla_weights, self.nabla_bias)
@classmethod
def init_temp(cls, layers):
size = 0
for layer in layers:
size = max(size, reduce(operator.mul, layer.temp_shape()))
cls.temp_weights = sfix.Array(size)
cls.temp_inputs = sfix.Array(size)
def __init__(self, input_shape, weight_shape, bias_shape, output_shape, stride,
padding='SAME', tf_weight_format=False, inputs=None):
super(ConvBase, self).__init__(input_shape, output_shape, inputs=inputs)
self.weight_shape = weight_shape
self.bias_shape = bias_shape
self.stride = stride
self.tf_weight_format = tf_weight_format
if padding == 'SAME':
# https://web.archive.org/web/20171223022012/https://www.tensorflow.org/api_guides/python/nn
self.padding = []
for i in 1, 2:
s = stride[i - 1]
assert output_shape[i] >= input_shape[i] // s
if tf_weight_format:
w = weight_shape[i - 1]
else:
w = weight_shape[i]
if input_shape[i] % s == 0:
pad_total = max(w - s, 0)
else:
pad_total = max(w - (input_shape[i] % s), 0)
self.padding.append(pad_total // 2)
elif padding == 'VALID':
self.padding = [0, 0]
elif isinstance(padding, int):
self.padding = [padding, padding]
else:
self.padding = padding
self.weight_squant = self.new_squant()
self.bias_squant = self.new_squant()
self.weights = Tensor(weight_shape, self.weight_squant)
self.bias = Array(output_shape[-1], self.bias_squant)
self.nabla_weights = Tensor(weight_shape, self.weight_squant)
self.nabla_bias = Array(output_shape[-1], self.bias_squant)
self.unreduced = Tensor(self.output_shape, sint, address=self.Y.address)
if tf_weight_format:
weight_in = weight_shape[2]
else:
weight_in = weight_shape[3]
assert(weight_in == input_shape[-1])
assert(bias_shape[0] == output_shape[-1])
assert(len(bias_shape) == 1)
assert(len(input_shape) == 4)
assert(len(output_shape) == 4)
assert(len(weight_shape) == 4)
def __repr__(self):
return '%s(%s, %s, %s, %s, %s, padding=%s, tf_weight_format=%s)' % \
(type(self).__name__, self.X.sizes, self.weight_shape,
self.bias_shape, self.Y.sizes, self.stride, repr(self.padding),
self.tf_weight_format)
def input_from(self, player, raw=False):
self.input_params_from(player)
self.weights.input_from(player, budget=100000, raw=raw)
if self.input_bias:
self.bias.input_from(player, raw=raw)
def output_weights(self):
self.weights.print_reveal_nested()
print_ln('%s', self.bias.reveal_nested())
def dot_product(self, iv, wv, out_y, out_x, out_c):
bias = self.bias[out_c]
acc = self.output_squant.unreduced_dot_product(iv, wv)
acc.v += bias.v
acc.res_params = self.output_squant.params
#self.Y[0][out_y][out_x][out_c] = acc.reduce_after_mul()
self.unreduced[0][out_y][out_x][out_c] = acc.v
def reduction(self, batch_length=1):
unreduced = self.unreduced
n_summands = self.n_summands()
#start_timer(2)
n_outputs = batch_length * reduce(operator.mul, self.output_shape[1:])
@multithread(self.n_threads, n_outputs,
1000 if sfix.round_nearest else 10 ** 6)
def _(base, n_per_thread):
res = self.input_squant().unreduced(
sint.load_mem(unreduced.address + base,
size=n_per_thread),
self.weight_squant(),
self.output_squant.params,
n_summands).reduce_after_mul()
res.store_in_mem(self.Y.address + base)
#stop_timer(2)
def temp_shape(self):
return list(self.output_shape[1:]) + [self.n_summands()]
def prepare_temp(self):
shape = self.temp_shape()
inputs = MultiArray(shape, self.input_squant,
address=self.temp_inputs)
weights = MultiArray(shape, self.weight_squant,
address=self.temp_weights)
return inputs, weights
class Conv2d(ConvBase):
def n_summands(self):
_, weights_h, weights_w, _ = self.weight_shape
_, inputs_h, inputs_w, n_channels_in = self.input_shape
return weights_h * weights_w * n_channels_in
def _forward(self, batch):
if self.tf_weight_format:
assert(self.weight_shape[3] == self.output_shape[-1])
weights_h, weights_w, _, _ = self.weight_shape
else:
assert(self.weight_shape[0] == self.output_shape[-1])
_, weights_h, weights_w, _ = self.weight_shape
_, inputs_h, inputs_w, n_channels_in = self.input_shape
_, output_h, output_w, n_channels_out = self.output_shape
stride_h, stride_w = self.stride
padding_h, padding_w = self.padding
if self.use_conv2ds:
n_parts = max(1, round((self.n_threads or 1) / n_channels_out))
while len(batch) % n_parts != 0:
n_parts -= 1
print('Convolution in %d parts' % n_parts)
part_size = len(batch) // n_parts
@for_range_multithread(self.n_threads, 1, [n_parts, n_channels_out])
def _(i, j):
inputs = self.X.get_slice_vector(
batch.get_part(i * part_size, part_size))
if self.tf_weight_format:
weights = self.weights.get_vector_by_indices(None, None, None, j)
else:
weights = self.weights.get_part_vector(j)
inputs = inputs.pre_mul()
weights = weights.pre_mul()
res = sint(size = output_h * output_w * part_size)
conv2ds(res, inputs, weights, output_h, output_w,
inputs_h, inputs_w, weights_h, weights_w,
stride_h, stride_w, n_channels_in, padding_h, padding_w,
part_size)
if self.bias_before_reduction:
res += self.bias.expand_to_vector(j, res.size).v
else:
res += self.bias.expand_to_vector(j, res.size).v << \
self.input_squant.f
addresses = regint.inc(res.size,
self.unreduced[i * part_size].address + j,
n_channels_out)
res.store_in_mem(addresses)
self.reduction(len(batch))
if self.debug_output:
print_ln('%s weights %s', self, self.weights.reveal_nested())
print_ln('%s bias %s', self, self.bias.reveal_nested())
@for_range(len(batch))
def _(i):
print_ln('%s X %s %s', self, i, self.X[batch[i]].reveal_nested())
print_ln('%s Y %s %s', self, i, self.Y[i].reveal_nested())
return
else:
assert len(batch) == 1
if self.fewer_rounds:
inputs, weights = self.prepare_temp()
@for_range_opt_multithread(self.n_threads,
[output_h, output_w, n_channels_out])
def _(out_y, out_x, out_c):
in_x_origin = (out_x * stride_w) - padding_w
in_y_origin = (out_y * stride_h) - padding_h
iv = []
wv = []
for filter_y in range(weights_h):
in_y = in_y_origin + filter_y
inside_y = (0 <= in_y) * (in_y < inputs_h)
for filter_x in range(weights_w):
in_x = in_x_origin + filter_x
inside_x = (0 <= in_x) * (in_x < inputs_w)
inside = inside_y * inside_x
if is_zero(inside):
continue
for in_c in range(n_channels_in):
iv += [self.X[0][in_y * inside_y]
[in_x * inside_x][in_c]]
wv += [self.weights[out_c][filter_y][filter_x][in_c]]
wv[-1] *= inside
if self.fewer_rounds:
inputs[out_y][out_x][out_c].assign(iv)
weights[out_y][out_x][out_c].assign(wv)
else:
self.dot_product(iv, wv, out_y, out_x, out_c)
if self.fewer_rounds:
@for_range_opt_multithread(self.n_threads,
list(self.output_shape[1:]))
def _(out_y, out_x, out_c):
self.dot_product(inputs[out_y][out_x][out_c],
weights[out_y][out_x][out_c],
out_y, out_x, out_c)
self.reduction()
class QuantConvBase(QuantBase):
def input_params_from(self, player):
for s in self.input_squant, self.weight_squant, self.bias_squant, self.output_squant:
s.get_params_from(player)
print('WARNING: assuming that bias quantization parameters are correct')
self.output_squant.params.precompute(self.input_squant.params, self.weight_squant.params)
class QuantConv2d(QuantConvBase, Conv2d):
pass
class FixConv2d(Conv2d, FixBase):
""" Fixed-point 2D convolution layer.
:param input_shape: input shape (tuple/list of four int)
:param weight_shape: weight shape (tuple/list of four int)
:param bias_shape: bias shape (tuple/list of one int)
:param output_shape: output shape (tuple/list of four int)
:param stride: stride (tuple/list of two int)
:param padding: :py:obj:`'SAME'` (default), :py:obj:`'VALID'`, or tuple/list of two int
:param tf_weight_format: weight shape format is (height, width, input channels, output channels) instead of the default (output channels, height, width, input channels)
"""
def reset(self):
assert not self.tf_weight_format
kernel_size = self.weight_shape[1] * self.weight_shape[2]
r = math.sqrt(6.0 / (kernel_size * sum(self.weight_shape[::3])))
print('Initializing convolution weights in [%f,%f]' % (-r, r))
self.weights.assign_vector(
sfix.get_random(-r, r, size=self.weights.total_size()))
self.bias.assign_all(0)
def backward(self, compute_nabla_X=True, batch=None):
assert self.use_conv2ds
assert not self.tf_weight_format
_, weights_h, weights_w, _ = self.weight_shape
_, inputs_h, inputs_w, n_channels_in = self.input_shape
_, output_h, output_w, n_channels_out = self.output_shape
stride_h, stride_w = self.stride
padding_h, padding_w = self.padding
N = len(batch)
self.nabla_bias.assign_all(0)
@for_range(N)
def _(i):
self.nabla_bias.assign_vector(
self.nabla_bias.get_vector() + sum(sum(
self.nabla_Y[i][j][k].get_vector() for k in range(output_w))
for j in range(output_h)))
input_size = inputs_h * inputs_w * N
batch_repeat = regint.Matrix(N, inputs_h * inputs_w)
batch_repeat.assign_vector(batch.get(
regint.inc(input_size, 0, 1, 1, N)) *
reduce(operator.mul, self.input_shape[1:]))
@for_range_opt_multithread(self.n_threads, [n_channels_in, n_channels_out])
def _(i, j):
a = regint.inc(input_size, self.X.address + i, n_channels_in, N,
inputs_h * inputs_w)
inputs = sfix.load_mem(batch_repeat.get_vector() + a).pre_mul()
b = regint.inc(N * output_w * output_h, self.nabla_Y.address + j, n_channels_out, N)
rep_out = regint.inc(output_h * output_w * N, 0, 1, 1, N) * \
reduce(operator.mul, self.output_shape[1:])
nabla_outputs = sfix.load_mem(rep_out + b).pre_mul()
res = sint(size = weights_h * weights_w)
conv2ds(res, inputs, nabla_outputs, weights_h, weights_w, inputs_h,
inputs_w, output_h, output_w, -stride_h, -stride_w, N,
padding_h, padding_w, 1)
reduced = unreduced_sfix._new(res).reduce_after_mul()
self.nabla_weights.assign_vector_by_indices(reduced, j, None, None, i)
if compute_nabla_X:
assert tuple(self.stride) == (1, 1)
reverse_weights = MultiArray(
[n_channels_in, weights_h, weights_w, n_channels_out], sfix)
@for_range_opt_multithread(self.n_threads, n_channels_in)
def _(l):
@for_range(weights_h)
def _(j):
@for_range(weights_w)
def _(k):
addresses = regint.inc(n_channels_out,
self.weights[0][j][weights_w-k-1].get_address(l),
reduce(operator.mul, self.weights.sizes[1:]))
reverse_weights[l][weights_h-j-1][k].assign_vector(
self.weights.value_type.load_mem(addresses))
padded_w = inputs_w + 2 * padding_w
padded_h = inputs_h + 2 * padding_h
if padding_h or padding_w:
output = MultiArray(
[N, padded_h, padded_w, n_channels_in], sfix)
else:
output = self.nabla_X
@for_range_opt_multithread(self.n_threads,
[N, n_channels_in])
def _(i, j):
res = sint(size = (padded_w * padded_h))
conv2ds(res, self.nabla_Y[i].get_vector().pre_mul(),
reverse_weights[j].get_vector().pre_mul(),
padded_h, padded_w, output_h, output_w,
weights_h, weights_w, 1, 1, n_channels_out,
weights_h - 1, weights_w - 1, 1)
output.assign_vector_by_indices(
unreduced_sfix._new(res).reduce_after_mul(),
i, None, None, j)
if padding_h or padding_w:
@for_range_opt_multithread(self.n_threads, N)
def _(i):
@for_range(inputs_h)
def _(j):
@for_range(inputs_w)
def _(k):
jj = j + padding_h
kk = k + padding_w
self.nabla_X[i][j][k].assign_vector(
output[i][jj][kk].get_vector())
if self.debug_output:
@for_range(len(batch))
def _(i):
print_ln('%s X %s %s', self, i, list(self.X[i].reveal_nested()))
print_ln('%s nabla Y %s %s', self, i, list(self.nabla_Y[i].reveal_nested()))
if compute_nabla_X:
print_ln('%s nabla X %s %s', self, i, self.nabla_X[batch[i]].reveal_nested())
print_ln('%s nabla weights %s', self,
(self.nabla_weights.reveal_nested()))
print_ln('%s weights %s', self, (self.weights.reveal_nested()))
print_ln('%s nabla b %s', self, (self.nabla_bias.reveal_nested()))
print_ln('%s bias %s', self, (self.bias.reveal_nested()))
class QuantDepthwiseConv2d(QuantConvBase, Conv2d):
def n_summands(self):
_, weights_h, weights_w, _ = self.weight_shape
return weights_h * weights_w
def _forward(self, batch):
assert len(batch) == 1
assert(self.weight_shape[-1] == self.output_shape[-1])
assert(self.input_shape[-1] == self.output_shape[-1])
_, weights_h, weights_w, _ = self.weight_shape
_, inputs_h, inputs_w, n_channels_in = self.input_shape
_, output_h, output_w, n_channels_out = self.output_shape
stride_h, stride_w = self.stride
padding_h, padding_w = self.padding
depth_multiplier = 1
if self.use_conv2ds:
assert depth_multiplier == 1
assert self.weight_shape[0] == 1
@for_range_opt_multithread(self.n_threads, n_channels_in)
def _(j):
inputs = self.X.get_vector_by_indices(0, None, None, j)
assert not self.tf_weight_format
weights = self.weights.get_vector_by_indices(0, None, None,
j)
inputs = inputs.pre_mul()
weights = weights.pre_mul()
res = sint(size = output_h * output_w)
conv2ds(res, inputs, weights, output_h, output_w,
inputs_h, inputs_w, weights_h, weights_w,
stride_h, stride_w, 1, padding_h, padding_w, 1)
res += self.bias.expand_to_vector(j, res.size).v
self.unreduced.assign_vector_by_indices(res, 0, None, None, j)
self.reduction()
return
else:
if self.fewer_rounds:
inputs, weights = self.prepare_temp()
@for_range_opt_multithread(self.n_threads,
[output_h, output_w, n_channels_in])
def _(out_y, out_x, in_c):
for m in range(depth_multiplier):
oc = m + in_c * depth_multiplier
in_x_origin = (out_x * stride_w) - padding_w
in_y_origin = (out_y * stride_h) - padding_h
iv = []
wv = []
for filter_y in range(weights_h):
for filter_x in range(weights_w):
in_x = in_x_origin + filter_x
in_y = in_y_origin + filter_y
inside = (0 <= in_x) * (in_x < inputs_w) * \
(0 <= in_y) * (in_y < inputs_h)
if is_zero(inside):
continue
iv += [self.X[0][in_y][in_x][in_c]]
wv += [self.weights[0][filter_y][filter_x][oc]]
wv[-1] *= inside
if self.fewer_rounds:
inputs[out_y][out_x][oc].assign(iv)
weights[out_y][out_x][oc].assign(wv)
else:
self.dot_product(iv, wv, out_y, out_x, oc)
if self.fewer_rounds:
@for_range_opt_multithread(self.n_threads,
list(self.output_shape[1:]))
def _(out_y, out_x, out_c):
self.dot_product(inputs[out_y][out_x][out_c],
weights[out_y][out_x][out_c],
out_y, out_x, out_c)
self.reduction()
class AveragePool2d(BaseLayer):
def __init__(self, input_shape, output_shape, filter_size, strides=(1, 1)):
super(AveragePool2d, self).__init__(input_shape, output_shape)
self.filter_size = filter_size
self.strides = strides
for i in (0, 1):
if strides[i] == 1:
assert output_shape[1+i] == 1
assert filter_size[i] == input_shape[1+i]
else:
assert strides[i] == filter_size[i]
assert output_shape[1+i] * strides[i] == input_shape[1+i]
def input_from(self, player, raw=False):
self.input_params_from(player)
def _forward(self, batch=[0]):
assert len(batch) == 1
_, input_h, input_w, n_channels_in = self.input_shape
_, output_h, output_w, n_channels_out = self.output_shape
assert n_channels_in == n_channels_out
padding_h, padding_w = (0, 0)
stride_h, stride_w = self.strides
filter_h, filter_w = self.filter_size
n = filter_h * filter_w
print('divisor: ', n)
@for_range_opt_multithread(self.n_threads,
[output_h, output_w, n_channels_in])
def _(out_y, out_x, c):
in_x_origin = (out_x * stride_w) - padding_w
in_y_origin = (out_y * stride_h) - padding_h
fxs = util.max(-in_x_origin, 0)
#fxe = min(filter_w, input_w - in_x_origin)
fys = util.max(-in_y_origin, 0)
#fye = min(filter_h, input_h - in_y_origin)
acc = 0
#fc = 0
for i in range(filter_h):
filter_y = fys + i
for j in range(filter_w):
filter_x = fxs + j
in_x = in_x_origin + filter_x
in_y = in_y_origin + filter_y
acc += self.X[0][in_y][in_x][c].v
#fc += 1
acc = self.const_div(acc, n)
self.Y[0][out_y][out_x][c] = self.output_squant._new(acc)
class QuantAveragePool2d(QuantBase, AveragePool2d):
def input_params_from(self, player):
print('WARNING: assuming that input and output quantization parameters are the same')
for s in self.input_squant, self.output_squant:
s.get_params_from(player)
class FixAveragePool2d(FixBase, AveragePool2d):
""" Fixed-point 2D AvgPool layer.
:param input_shape: input shape (tuple/list of four int)
:param output_shape: output shape (tuple/list of four int)
:param filter_size: filter size (tuple/list of two int)
:param strides: strides (tuple/list of two int)
"""
class QuantReshape(QuantBase, BaseLayer):
def __init__(self, input_shape, _, output_shape):
super(QuantReshape, self).__init__(input_shape, output_shape)
def input_from(self, player):
print('WARNING: assuming that input and output quantization parameters are the same')
_ = self.new_squant()
for s in self.input_squant, _, self.output_squant:
s.set_params(sfloat.get_input_from(player), sint.get_input_from(player))
for i in range(2):
sint.get_input_from(player)
def _forward(self, batch):
assert len(batch) == 1
# reshaping is implicit
self.Y.assign(self.X)
class QuantSoftmax(QuantBase, BaseLayer):
def input_from(self, player):
print('WARNING: assuming that input and output quantization parameters are the same')
for s in self.input_squant, self.output_squant:
s.set_params(sfloat.get_input_from(player), sint.get_input_from(player))
def _forward(self, batch):
assert len(batch) == 1
assert(len(self.input_shape) == 2)
# just print the best
def comp(left, right):
c = left[1].v.greater_than(right[1].v, self.input_squant.params.k)
#print_ln('comp %s %s %s', c.reveal(), left[1].v.reveal(), right[1].v.reveal())
return [c.if_else(x, y) for x, y in zip(left, right)]
print_ln('guess: %s', util.tree_reduce(comp, list(enumerate(self.X[0])))[0].reveal())
class Optimizer:
""" Base class for graphs of layers. """
n_threads = Layer.n_threads
always_shuffle = True
time_layers = False
revealing_correctness = False
early_division = False
@staticmethod
def from_args(program, layers):
if 'adam' in program.args or 'adamapprox' in program.args:
res = Adam(layers, 1, approx='adamapprox' in program.args)
elif 'amsgrad' in program.args:
res = Adam(layers, approx=True, amsgrad=True)
elif 'quotient' in program.args:
res = Adam(layers, approx=True, amsgrad=True, normalize=True)
else:
res = SGD(layers, 1)
res.early_division = 'early_div' in program.args
return res
def __init__(self, report_loss=None):
if get_program().options.binary:
raise CompilerError(
'machine learning code not compatible with binary circuits')
self.tol = 0.000
self.report_loss = report_loss
self.X_by_label = None
self.print_update_average = False
self.print_random_update = False
self.print_losses = False
self.print_loss_reduction = False
self.i_epoch = MemValue(0)
self.stopped_on_loss = MemValue(0)
self.stopped_on_low_loss = MemValue(0)
@property
def layers(self):
""" Get all layers. """
return self._layers
@layers.setter
def layers(self, layers):
""" Construct linear graph from list of layers. """
self._layers = layers
self.thetas = []
prev = None
for layer in layers:
if not layer.inputs and prev is not None:
layer.inputs = [prev]
prev = layer
self.thetas.extend(layer.thetas())
def set_layers_with_inputs(self, layers):
""" Construct graph from :py:obj:`inputs` members of list of layers. """
self._layers = layers
used = set([None])
for layer in reversed(layers):
layer.last_used = list(filter(lambda x: x not in used, layer.inputs))
used.update(layer.inputs)
def reset(self):
""" Initialize weights. """
for layer in self.layers:
layer.reset()
self.i_epoch.write(0)
self.stopped_on_loss.write(0)
def batch_for(self, layer, batch):
if layer in (self.layers[0], self.layers[-1]):
assert not isinstance(layer, BatchNorm)
return batch
else:
batch = regint.Array(len(batch))
batch.assign(regint.inc(len(batch)))
return batch
@_no_mem_warnings
def forward(self, N=None, batch=None, keep_intermediate=True,
model_from=None, training=False, run_last=True):
""" Compute graph.
:param N: batch size (used if batch not given)
:param batch: indices for computation (:py:class:`~Compiler.types.Array` or list)
:param keep_intermediate: do not free memory of intermediate results after use
:param model_from: if given, read the model parameters from this player
:param training: set to true for a training-mode pass (relevant for layers such as dropout)
:param run_last: whether to run the last layer
"""
if batch is None:
batch = regint.Array(N)
batch.assign(regint.inc(N))
for i, layer in enumerate(self.layers):
if layer.inputs and len(layer.inputs) == 1 and layer.inputs[0] is not None:
layer._X.address = layer.inputs[0].Y.address
layer.Y.alloc()
if model_from is not None:
layer.input_from(model_from)
break_point()
if self.time_layers:
start_timer(100 + i)
if i != len(self.layers) - 1 or run_last:
layer.forward(batch=self.batch_for(layer, batch),
training=training)
if self.print_random_update:
print_ln('forward layer %s', layer)
l = min(100, layer.Y[0].total_size())
# use a separate variable for the random sample index so the
# layer index i stays intact for the timer calls below
k = regint.get_random(64) % len(batch)
if l < 100:
j = 0
else:
j = regint.get_random(64) % \
(layer.Y[k].total_size() - l)
print_ln('forward layer %s at (%s, %s): %s', layer, k, j,
layer.Y[k].to_array().get_vector(j, l).reveal())
k = regint.get_random(64) % layer.Y[0].total_size()
print_ln('forward layer %s vertical at %s: %s', layer, k,
[layer.Y[j].to_array()[k].reveal()
for j in range(len(batch))])
if self.time_layers:
stop_timer(100 + i)
break_point()
if not keep_intermediate:
for l in layer.last_used:
l.Y.delete()
for theta in layer.thetas():
theta.delete()
@_no_mem_warnings
def eval(self, data, batch_size=None, top=False):
""" Compute evaluation after training.
:param data: sample data (:py:class:`Compiler.types.Matrix` with one row per sample)
:param batch_size: batch size for the computation (optional)
:param top: return top prediction instead of probability distribution
:returns: sint Array if :py:obj:`top` is used, sfix Array for a single-output network, and sfix Matrix otherwise
"""
if isinstance(self.layers[-1].Y, Array) or top:
if top:
res = sint.Array(len(data))
else:
res = sfix.Array(len(data))
else:
res = sfix.Matrix(len(data), self.layers[-1].d_out)
def f(start, batch_size, batch):
batch.assign_vector(regint.inc(batch_size, start))
self.forward(batch=batch, run_last=not top)
part = self.layers[-1].eval(batch_size, top=top)
res.assign_part_vector(part.get_vector(), start)
self.run_in_batches(f, data, batch_size or len(self.layers[1].X))
return res
@_no_mem_warnings
def backward(self, batch):
""" Compute backward propagation. """
for i, layer in reversed(list(enumerate(self.layers))):
assert len(batch) <= layer.back_batch_size
if self.time_layers:
start_timer(200 + i)
if not layer.inputs:
layer.backward(compute_nabla_X=False,
batch=self.batch_for(layer, batch))
else:
layer.backward(batch=self.batch_for(layer, batch))
if len(layer.inputs) == 1:
layer.inputs[0].nabla_Y.address = \
layer.nabla_X.address
if i == len(self.layers) - 1 and self.early_division:
layer.nabla_X.assign_vector(
layer.nabla_X.get_vector() / len(batch))
if self.time_layers:
stop_timer(200 + i)
@_no_mem_warnings
def run(self, batch_size=None, stop_on_loss=0):
""" Run training.
:param batch_size: batch size (defaults to example size of first layer)
:param stop_on_loss: stop when the loss reaches or exceeds this value, e.g. to detect divergence (implies revealing; default: 0 for no stopping)
"""
if self.n_epochs == 0:
return
if batch_size is not None:
N = batch_size
else:
N = self.layers[0].N
i = self.i_epoch
n_iterations = MemValue(0)
self.n_correct = MemValue(0)
@for_range(self.n_epochs)
def _(_):
if self.X_by_label is None:
self.X_by_label = [[None] * self.layers[0].N]
assert len(self.X_by_label) in (1, 2)
assert N % len(self.X_by_label) == 0
n = N // len(self.X_by_label)
n_per_epoch = int(math.ceil(1. * max(len(X) for X in
self.X_by_label) / n))
print('%d runs per epoch' % n_per_epoch)
indices_by_label = []
for label, X in enumerate(self.X_by_label):
indices = regint.Array(n * n_per_epoch)
indices_by_label.append(indices)
indices.assign(regint.inc(len(X)))
missing = len(indices) - len(X)
if missing:
indices.assign_vector(
regint.get_random(int(math.log2(len(X))), size=missing),
base=len(X))
if self.always_shuffle or n_per_epoch > 1:
indices.shuffle()
loss_sum = MemValue(sfix(0))
self.n_correct.write(0)
@for_range(n_per_epoch)
def _(j):
n_iterations.iadd(1)
batch = regint.Array(N)
for label, X in enumerate(self.X_by_label):
indices = indices_by_label[label]
batch.assign(indices.get_vector(j * n, n) +
regint(label * len(self.X_by_label[0]), size=n),
label * n)
self.forward(batch=batch, training=True)
self.backward(batch=batch)
if self.time_layers:
start_timer(1000)
self.update(i, batch=batch)
if self.time_layers:
stop_timer(1000)
loss_sum.iadd(self.layers[-1].l)
if self.print_loss_reduction:
before = self.layers[-1].average_loss(N)
self.forward(batch=batch)
after = self.layers[-1].average_loss(N)
print_ln('loss reduction in batch %s: %s (%s - %s)', j,
before - after, before, after)
elif self.print_losses:
print_str('\rloss in batch %s: %s/%s', j,
self.layers[-1].average_loss(N),
loss_sum.reveal() / (j + 1))
if self.revealing_correctness:
part_truth = self.layers[-1].Y.same_shape()
part_truth.assign_vector(
self.layers[-1].Y.get_slice_vector(batch))
self.n_correct.iadd(
self.layers[-1].reveal_correctness(batch_size, part_truth))
if stop_on_loss:
loss = self.layers[-1].average_loss(N)
res = (loss < stop_on_loss) * (loss >= -1)
self.stopped_on_loss.write(1 - res)
return res
if self.print_losses:
print_ln()
if self.report_loss and self.layers[-1].compute_loss and self.layers[-1].approx != 5:
print_ln('loss in epoch %s: %s', i,
(loss_sum.reveal() * cfix(1 / n_per_epoch)))
else:
print_ln('done with epoch %s', i)
time()
i.iadd(1)
res = True
if self.tol > 0:
res *= (1 - (loss_sum >= 0) * \
(loss_sum < self.tol * n_per_epoch)).reveal()
self.stopped_on_low_loss.write(1 - res)
return res
def reveal_correctness(self, data, truth, batch_size):
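""" Reveal the number of correct predictions on the given data.
:param data: test sample data
:param truth: test labels
:param batch_size: batch size
:returns: tuple of the number of correct predictions and the average loss
"""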
N = data.sizes[0]
n_correct = MemValue(0)
loss = MemValue(sfix(0))
def f(start, batch_size, batch):
batch.assign_vector(regint.inc(batch_size, start))
self.forward(batch=batch)
part_truth = truth.get_part(start, batch_size)
n_correct.iadd(
self.layers[-1].reveal_correctness(batch_size, part_truth))
loss.iadd(self.layers[-1].l * batch_size)
self.run_in_batches(f, data, batch_size)
loss = loss.reveal()
if cfix.f < 31:
loss = cfix._new(loss.v << (31 - cfix.f), k=63, f=31)
return n_correct, loss / N
def run_in_batches(self, f, data, batch_size, truth=None):
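""" Call :py:obj:`f` on consecutive batches of :py:obj:`data` (and
:py:obj:`truth` if given), temporarily redirecting the first layer's
input and the last layer's labels to the given data. """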
training_data = self.layers[0].X.address
training_truth = self.layers[-1].Y.address
self.layers[0].X.address = data.address
if truth:
self.layers[-1].Y.address = truth.address
N = data.sizes[0]
batch = regint.Array(batch_size)
@for_range(N // batch_size)
def _(i):
start = i * batch_size
f(start, batch_size, batch)
batch_size = N % batch_size
if batch_size:
start = N - batch_size
f(start, batch_size, batch)
self.layers[0].X.address = training_data
self.layers[-1].Y.address = training_truth
@_no_mem_warnings
def run_by_args(self, program, n_runs, batch_size, test_X, test_Y,
acc_batch_size=None):
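""" Run training and evaluation according to program arguments, for
example 'rate<x>' to set the learning rate, 'dep<x>' to multiply the
learning rate by <x> after every run, 'nomom' to disable momentum,
'no_acc' to skip accuracy computation, 'no_loss' to skip loss
computation, 'model_input' to read the model from player 0,
'model_output' to output the model at the end, and 'bench1'/'bench10'
to benchmark one or ten iterations only. """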
if acc_batch_size is None:
acc_batch_size = batch_size
depreciation = None
for arg in program.args:
m = re.match('rate(.*)', arg)
if m:
self.gamma = MemValue(cfix(float(m.group(1))))
m = re.match('dep(.*)', arg)
if m:
depreciation = float(m.group(1))
if 'nomom' in program.args:
self.momentum = 0
self.print_losses = 'print_losses' in program.args
self.print_random_update = 'print_random_update' in program.args
Layer.print_random_update = self.print_random_update
self.time_layers = 'time_layers' in program.args
self.revealing_correctness = not 'no_acc' in program.args
self.layers[-1].compute_loss = not 'no_loss' in program.args
if 'full_cisc' in program.args:
program.options.keep_cisc = 'FPDiv,exp2_fx,log2_fx'
model_input = 'model_input' in program.args
acc_first = model_input and not 'train_first' in program.args
if model_input:
for layer in self.layers:
layer.input_from(0)
else:
self.reset()
if 'one_iter' in program.args:
print_float_prec(16)
self.output_weights()
print_ln('loss')
self.eval(
self.layers[0].X.get_part(0, batch_size),
batch_size=batch_size).print_reveal_nested()
for layer in self.layers:
layer.X.get_part(0, batch_size).print_reveal_nested()
print_ln('%s', self.layers[-1].Y.get_part(0, batch_size).reveal_nested())
batch = Array.create_from(regint.inc(batch_size))
self.forward(batch=batch, training=True)
self.backward(batch=batch)
self.update(0, batch=batch)
print_ln('loss %s', self.layers[-1].l.reveal())
self.output_weights()
return
if 'bench10' in program.args or 'bench1' in program.args:
n = 1 if 'bench1' in program.args else 10
print('benchmarking %s iterations' % n)
@for_range(n)
def _(i):
batch = Array.create_from(regint.inc(batch_size))
self.forward(batch=batch, training=True)
self.backward(batch=batch)
self.update(0, batch=batch)
return
@for_range(n_runs)
def _(i):
if not acc_first:
start_timer(1)
self.run(batch_size,
stop_on_loss=0 if 'no_loss' in program.args else 100)
stop_timer(1)
if 'no_acc' in program.args:
return
N = self.layers[0].X.sizes[0]
n_trained = (N + batch_size - 1) // batch_size * batch_size
if not acc_first:
print_ln('train_acc: %s (%s/%s)',
cfix(self.n_correct, k=63, f=31) / n_trained,
self.n_correct, n_trained)
if test_X and test_Y:
print('use test set')
n_test = len(test_Y)
n_correct, loss = self.reveal_correctness(test_X, test_Y,
acc_batch_size)
print_ln('test loss: %s', loss)
print_ln('acc: %s (%s/%s)',
cfix(n_correct, k=63, f=31) / n_test,
n_correct, n_test)
if acc_first:
start_timer(1)
self.run(batch_size)
stop_timer(1)
else:
@if_(util.or_op(self.stopped_on_loss, n_correct <
int(n_test // self.layers[-1].n_outputs * 1.2)))
def _():
self.gamma.imul(.5)
if 'crash' in program.args:
@if_(self.gamma == 0)
def _():
runtime_error('diverging')
self.reset()
print_ln('reset after reducing learning rate to %s',
self.gamma)
if depreciation:
self.gamma.imul(depreciation)
print_ln('reducing learning rate to %s', self.gamma)
return 1 - self.stopped_on_low_loss
if 'model_output' in program.args:
self.output_weights()
def output_weights(self):
print_float_precision(max(6, sfix.f // 3))
for layer in self.layers:
layer.output_weights()
def summary(self):
sizes = [var.total_size() for var in self.thetas]
print(sizes)
print('Trainable params:', sum(sizes))
class Adam(Optimizer):
""" Adam/AMSgrad optimizer.
:param layers: layers of linear graph
:param n_epochs: number of epochs for training (default: 1)
:param approx: use approximation for inverse square root (bool)
:param amsgrad: use AMSgrad (bool)
:param normalize: scale gradients to a maximum absolute value of 1 before the update (bool)
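
A construction sketch (layer and data sizes are assumptions for
illustration)::

    adam = ml.Adam([ml.Dense(1000, 20, 1), ml.Output(1000)],
                   n_epochs=10, approx=True)

Training then proceeds via :py:obj:`reset` and :py:obj:`run` as with
:py:class:`SGD`.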
"""
def __init__(self, layers, n_epochs=1, approx=False, amsgrad=False,
normalize=False):
self.gamma = MemValue(cfix(.001))
self.beta1 = 0.9
self.beta2 = 0.999
self.beta1_power = MemValue(cfix(1))
self.beta2_power = MemValue(cfix(1))
self.epsilon = max(2 ** -((sfix.k - sfix.f - 8) / (1 + approx)), 10 ** -8)
self.n_epochs = n_epochs
self.approx = approx
self.amsgrad = amsgrad
self.normalize = normalize
if amsgrad:
print_str('Using AMSgrad ')
else:
print_str('Using Adam ')
if approx:
print_ln('with inverse square root approximation')
else:
print_ln('with more precise inverse square root')
if normalize:
print_ln('Normalize gradient')
self.layers = layers
self.ms = []
self.vs = []
self.gs = []
self.vhats = []
for layer in layers:
for nabla in layer.nablas():
self.gs.append(nabla)
for x in self.ms, self.vs:
x.append(nabla.same_shape())
if amsgrad:
self.vhats.append(nabla.same_shape())
super(Adam, self).__init__()
def update(self, i_epoch, batch):
self.beta1_power *= self.beta1
self.beta2_power *= self.beta2
m_factor = MemValue(1 / (1 - self.beta1_power))
v_factor = MemValue(1 / (1 - self.beta2_power))
for i_layer, (m, v, g, theta) in enumerate(zip(self.ms, self.vs,
self.gs, self.thetas)):
if self.normalize:
abs_g = g.same_shape()
@multithread(self.n_threads, g.total_size())
def _(base, size):
abs_g.assign_vector(abs(g.get_vector(base, size)), base)
max_g = tree_reduce_multithread(self.n_threads,
util.max, abs_g.get_vector())
scale = MemValue(sfix._new(library.AppRcr(
max_g.v, max_g.k, max_g.f, simplex_flag=True)))
@multithread(self.n_threads, m.total_size(),
max_size=get_program().budget)
def _(base, size):
m_part = m.get_vector(base, size)
v_part = v.get_vector(base, size)
g_part = g.get_vector(base, size)
if self.normalize:
g_part *= scale.expand_to_vector(size)
m_part = self.beta1 * m_part + (1 - self.beta1) * g_part
v_part = self.beta2 * v_part + (1 - self.beta2) * g_part ** 2
m.assign_vector(m_part, base)
v.assign_vector(v_part, base)
if self.amsgrad:
vhat = self.vhats[i_layer].get_vector(base, size)
vhat = util.max(vhat, v_part)
self.vhats[i_layer].assign_vector(vhat, base)
diff = self.gamma.expand_to_vector(size) * m_part
else:
mhat = m_part * m_factor.expand_to_vector(size)
vhat = v_part * v_factor.expand_to_vector(size)
diff = self.gamma.expand_to_vector(size) * mhat
if self.approx:
diff *= mpc_math.InvertSqrt(vhat + self.epsilon ** 2)
else:
diff /= mpc_math.sqrt(vhat) + self.epsilon
theta.assign_vector(theta.get_vector(base, size) - diff, base)
class SGD(Optimizer):
""" Stochastic gradient descent.
:param layers: layers of linear graph
:param n_epochs: number of epochs for training
:param debug: print diagnostics if gradients or parameters exceed this threshold (implies revealing)
:param report_loss: disclose and print loss
"""
def __init__(self, layers, n_epochs, debug=False, report_loss=None):
self.momentum = 0.9
self.layers = layers
self.n_epochs = n_epochs
self.nablas = []
self.delta_thetas = []
for layer in layers:
self.nablas.extend(layer.nablas())
for theta in layer.thetas():
self.delta_thetas.append(theta.same_shape())
self.gamma = MemValue(cfix(0.01))
self.debug = debug
super(SGD, self).__init__(report_loss)
@_no_mem_warnings
def reset(self, X_by_label=None):
""" Reset layer parameters.
:param X_by_label: if given, set training data by public labels for balancing
"""
self.X_by_label = X_by_label
if X_by_label is not None:
for label, X in enumerate(X_by_label):
@for_range_multithread(self.n_threads, 1, len(X))
def _(i):
j = i + label * len(X_by_label[0])
self.layers[0].X[j] = X[i]
self.layers[-1].Y[j] = label
for y in self.delta_thetas:
y.assign_all(0)
super(SGD, self).reset()
def update(self, i_epoch, batch):
for nabla, theta, delta_theta in zip(self.nablas, self.thetas,
self.delta_thetas):
@multithread(self.n_threads, nabla.total_size())
def _(base, size):
old = delta_theta.get_vector(base, size)
red_old = self.momentum * old
rate = self.gamma.expand_to_vector(size)
nabla_vector = nabla.get_vector(base, size)
log_batch_size = math.log(len(batch), 2)
# divide by len(batch) by truncation
# increased rate if len(batch) is not a power of two
pre_trunc = nabla_vector.v * rate.v
k = max(nabla_vector.k, rate.k) + rate.f
m = rate.f + int(log_batch_size)
if self.early_division:
v = pre_trunc
else:
v = pre_trunc.round(k, m, signed=True,
nearest=sfix.round_nearest)
new = nabla_vector._new(v)
diff = red_old - new
delta_theta.assign_vector(diff, base)
theta.assign_vector(theta.get_vector(base, size) +
delta_theta.get_vector(base, size), base)
if self.print_update_average:
vec = abs(delta_theta.get_vector().reveal())
print_ln('update average: %s (%s)',
sum(vec) * cfix(1 / len(vec)), len(vec))
if self.debug:
limit = int(self.debug)
d = delta_theta.get_vector().reveal()
aa = [cfix.Array(len(d.v)) for i in range(3)]
a = aa[0]
a.assign(d)
@for_range(len(a))
def _(i):
x = a[i]
print_ln_if((x > limit) + (x < -limit),
'update epoch=%s %s index=%s %s',
i_epoch.read(), str(delta_theta), i, x)
a = aa[1]
a.assign(nabla.get_vector().reveal())
@for_range(len(a))
def _(i):
x = a[i]
print_ln_if((x > len(batch) * limit) + (x < -len(batch) * limit),
'nabla epoch=%s %s index=%s %s',
i_epoch.read(), str(nabla), i, x)
a = aa[2]
a.assign(theta.get_vector().reveal())
@for_range(len(a))
def _(i):
x = a[i]
print_ln_if((x > limit) + (x < -limit),
'theta epoch=%s %s index=%s %s',
i_epoch.read(), str(theta), i, x)
if self.print_random_update:
print_ln('update')
l = min(100, nabla.total_size())
if l < 100:
index = 0
else:
index = regint.get_random(64) % (nabla.total_size() - l)
print_ln('%s at %s: nabla=%s update=%s theta=%s', str(theta),
index, nabla.to_array().get_vector(index, l).reveal(),
delta_theta.to_array().get_vector(index, l).reveal(),
theta.to_array().get_vector(index, l).reveal())
self.gamma.imul(1 - 10 ** -6)
def apply_padding(input_shape, kernel_size, strides, padding):
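""" Compute the spatial output dimensions for 'valid' or 'same'
padding or for an explicit amount of zero-padding (int). """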
if isinstance(padding, int):
input_shape = [x + 2 * padding for x in input_shape]
padding = 'valid'
if padding == 'valid':
res = (input_shape[0] - kernel_size[0] + 1) // strides[0], \
(input_shape[1] - kernel_size[1] + 1) // strides[1],
assert min(res) > 0, (input_shape, kernel_size, strides, padding)
return res
elif padding == 'same':
return (input_shape[0]) // strides[0], \
(input_shape[1]) // strides[1],
else:
raise Exception('invalid padding: %s' % padding)
class keras:
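""" Minimal Keras-like interface to the functionality in this module.
A usage sketch for logistic regression, assuming sample data in
:py:obj:`X_train`/:py:obj:`X_test` and 0/1 labels in :py:obj:`y_train`
(names chosen for illustration)::

    model = ml.keras.models.Sequential([ml.keras.layers.Dense(1)])
    model.compile(optimizer=ml.keras.optimizers.SGD(learning_rate=0.01))
    opt = model.fit(X_train, y_train, batch_size=128, epochs=10)
    res = model.predict(X_test)
"""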
class layers:
Flatten = lambda *args, **kwargs: ('flatten', args, kwargs)
Dense = lambda *args, **kwargs: ('dense', args, kwargs)
def Conv2D(filters, kernel_size, strides=(1, 1), padding='valid',
activation=None, input_shape=None):
return 'conv2d', {'filters': filters, 'kernel_size': kernel_size,
'strides': strides, 'padding': padding,
'activation': activation}
def MaxPooling2D(pool_size=2, strides=None, padding='valid'):
return 'maxpool', {'pool_size': pool_size, 'strides': strides,
'padding': padding}
def Dropout(rate):
l = math.log(rate, 2)
if int(l) != l:
raise Exception('rate needs to be a power of two')
return 'dropout', rate
def Activation(activation):
assert(activation == 'relu')
return activation,
def BatchNormalization():
return 'batchnorm',
class optimizers:
SGD = lambda *args, **kwargs: ('sgd', args, kwargs)
Adam = lambda *args, **kwargs: ('adam', args, kwargs)
class models:
class Sequential:
def __init__(self, layers):
self.layers = layers
self.optimizer = None
self.opt = None
def compile(self, optimizer):
self.optimizer = optimizer
def compile_by_args(self, program):
if 'adam' in program.args:
self.optimizer = 'adam', [], {}
elif 'amsgrad' in program.args:
self.optimizer = 'adam', [], {'amsgrad': True}
else:
self.optimizer = 'sgd', [], {}
@property
def trainable_variables(self):
if self.opt == None:
raise Exception('need to run build() or fit() first')
return list(self.opt.thetas)
def summary(self):
self.opt.summary()
def build(self, input_shape, batch_size=128):
data_input_shape = input_shape
if self.opt != None and \
input_shape == self.opt.layers[0].X.sizes and \
batch_size <= self.batch_size and \
type(self.opt).__name__.lower() == self.optimizer[0]:
return
if self.optimizer == None:
self.optimizer = 'inference', [], {}
if input_shape == None:
raise Exception('must specify number of samples')
Layer.back_batch_size = batch_size
layers = []
for i, layer in enumerate(self.layers):
name = layer[0]
if name == 'dense':
if len(layers) == 0:
N = input_shape[0]
n_units = reduce(operator.mul, input_shape[1:])
else:
N = batch_size
n_units = reduce(operator.mul,
layers[-1].Y.sizes[1:])
if i == len(self.layers) - 1:
if layer[2].get('activation', 'softmax') in \
('softmax', 'sigmoid'):
layer[2].pop('activation', None)
layers.append(Dense(N, n_units, layer[1][0],
**layer[2]))
input_shape = layers[-1].Y.sizes
elif name == 'conv2d':
input_shape = list(input_shape) + \
[1] * (4 - len(input_shape))
print(layer[1])
kernel_size = layer[1]['kernel_size']
filters = layer[1]['filters']
strides = layer[1]['strides']
padding = layer[1]['padding']
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if isinstance(strides, int):
strides = (strides, strides)
weight_shape = [filters] + list(kernel_size) + \
[input_shape[-1]]
output_shape = [batch_size] + list(
apply_padding(input_shape[1:3], kernel_size,
strides, padding)) + [filters]
padding = padding.upper() if isinstance(padding, str) \
else padding
layers.append(FixConv2d(input_shape, weight_shape,
(filters,), output_shape,
strides, padding))
input_shape = output_shape
print('conv output shape', output_shape)
elif name == 'maxpool':
pool_size = layer[1]['pool_size']
strides = layer[1]['strides']
padding = layer[1]['padding']
if isinstance(pool_size, int):
pool_size = (pool_size, pool_size)
if isinstance(strides, int):
strides = (strides, strides)
if strides == None:
strides = pool_size
layers.append(MaxPool(input_shape,
[1] + list(strides) + [1],
[1] + list(pool_size) + [1],
padding))
input_shape = layers[-1].Y.sizes
elif name == 'dropout':
layers.append(Dropout(batch_size, reduce(
operator.mul, layers[-1].Y.sizes[1:]),
alpha=layer[1]))
input_shape = layers[-1].Y.sizes
elif name == 'flatten':
pass
elif name == 'relu':
layers.append(Relu(layers[-1].Y.sizes))
elif name == 'batchnorm':
input_shape = layers[-1].Y.sizes
layers.append(BatchNorm(layers[-1].Y.sizes))
else:
raise Exception(layer[0] + ' not supported')
if layers[-1].d_out == 1:
layers.append(Output(data_input_shape[0]))
else:
layers.append(
MultiOutput(data_input_shape[0], layers[-1].d_out))
if self.optimizer[1]:
raise Exception('use keyword arguments for optimizer')
opt = self.optimizer[0]
opts = self.optimizer[2]
if opt == 'sgd':
opt = SGD(layers, 1)
momentum = opts.pop('momentum', None)
if momentum != None:
opt.momentum = momentum
elif opt == 'adam':
opt = Adam(layers, amsgrad=opts.pop('amsgrad', None),
approx=True)
beta1 = opts.pop('beta_1', None)
beta2 = opts.pop('beta_2', None)
epsilon = opts.pop('epsilon', None)
if beta1 != None:
opt.beta1 = beta1
if beta2:
opt.beta2 = beta2
if epsilon:
if epsilon < opt.epsilon:
print('WARNING: epsilon smaller than default might '
'cause overflows')
opt.epsilon = epsilon
elif opt == 'inference':
opt = Optimizer()
opt.layers = layers
else:
raise Exception(opt + ' not supported')
lr = opts.pop('learning_rate', None)
if lr != None:
opt.gamma = MemValue(cfix(lr))
if opts:
raise Exception('%s not supported' % opts)
self.batch_size = batch_size
self.opt = opt
def fit(self, x, y, batch_size, epochs=1, validation_data=None):
assert len(x) == len(y)
self.build(x.sizes, batch_size)
if x.total_size() != self.opt.layers[0].X.total_size():
raise Exception('sample data size mismatch')
if y.total_size() != self.opt.layers[-1].Y.total_size():
print(y, self.opt.layers[-1].Y)
raise Exception('label size mismatch')
if validation_data == None:
validation_data = None, None
else:
if len(validation_data[0]) != len(validation_data[1]):
raise Exception('test set size mismatch')
self.opt.layers[0].X.address = x.address
self.opt.layers[-1].Y.address = y.address
self.opt.run_by_args(get_program(), epochs, batch_size,
validation_data[0], validation_data[1],
batch_size)
return self.opt
def predict(self, x, batch_size=None):
if self.opt == None:
raise Exception('need to run fit() or build() first')
if batch_size != None:
batch_size = min(batch_size, self.batch_size)
return self.opt.eval(x, batch_size=batch_size)
def solve_linear(A, b, n_iterations, progress=False, n_threads=None,
stop=False, already_symmetric=False, precond=False):
""" Iterative linear solution approximation for :math:`Ax=b`.
:param A: matrix (sfix) with number of rows equal to the length of :py:obj:`b`
:param b: vector (sfix)
:param n_iterations: maximum number of iterations
:param progress: print some information on the progress (implies revealing)
:param n_threads: number of threads to use
:param stop: whether to stop when converged (implies revealing)
:param already_symmetric: whether :py:obj:`A` is already symmetric, skipping the reduction to normal equations
:param precond: use diagonal preconditioning
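
A usage sketch (dimensions are assumptions for illustration)::

    A = sfix.Matrix(100, 10)
    b = sfix.Array(100)
    # ... fill A and b ...
    x = ml.solve_linear(A, b, 20)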
"""
assert len(b) == A.sizes[0]
x = sfix.Array(A.sizes[1])
x.assign_vector(sfix.get_random(-1, 1, size=len(x)))
if already_symmetric:
AtA = A
r = Array.create_from(b - AtA * x)
else:
AtA = sfix.Matrix(len(x), len(x))
A.trans_mul_to(A, AtA, n_threads=n_threads)
r = Array.create_from(A.transpose() * b - AtA * x)
if precond:
return solve_linear_diag_precond(AtA, b, x, r, n_iterations,
progress, stop)
v = sfix.Array(A.sizes[1])
v.assign_all(0)
Av = sfix.Array(len(x))
@for_range(n_iterations)
def _(i):
v[:] = r - sfix.dot_product(r, Av) / sfix.dot_product(v, Av) * v
Av[:] = AtA * v
v_norm = sfix.dot_product(v, Av)
vr = sfix.dot_product(v, r)
alpha = (v_norm == 0).if_else(0, vr / v_norm)
x[:] = x + alpha * v
r[:] = r - alpha * Av
if progress:
print_ln('%s alpha=%s vr=%s v_norm=%s', i, alpha.reveal(),
vr.reveal(), v_norm.reveal())
if stop:
return (alpha > 0).reveal()
return x
def solve_linear_diag_precond(A, b, x, r, n_iterations, progress=False,
stop=False):
m = 1 / A.diag()
mr = Array.create_from(m * r[:])
d = Array.create_from(mr)
@for_range(n_iterations)
def _(i):
Ad = A * d
d_norm = sfix.dot_product(d, Ad)
alpha = (d_norm == 0).if_else(0, sfix.dot_product(r, mr) / d_norm)
x[:] = x[:] + alpha * d[:]
r_norm = sfix.dot_product(r, mr)
r[:] = r[:] - alpha * Ad
tmp = m * r[:]
beta = (r_norm == 0).if_else(0, sfix.dot_product(r, tmp) / r_norm)
mr[:] = tmp
d[:] = tmp + beta * d
if progress:
print_ln('%s alpha=%s beta=%s r_norm=%s d_norm=%s', i,
alpha.reveal(), beta.reveal(), r_norm.reveal(),
d_norm.reveal())
if stop:
return (alpha > 0).reveal()
return x
def mr(A, n_iterations, stop=False):
""" Iterative matrix inverse approximation.
:param A: matrix to invert
:param n_iterations: maximum number of iterations
:param stop: whether to stop when converged (implies revealing)
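
A usage sketch (dimension is an assumption for illustration)::

    A = sfix.Matrix(10, 10)
    # ... fill A ...
    A_inv = ml.mr(A, 20)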
"""
assert len(A.sizes) == 2
assert A.sizes[0] == A.sizes[1]
M = A.same_shape()
n = A.sizes[0]
@for_range(n)
def _(i):
e = sfix.Array(n)
e.assign_all(0)
e[i] = 1
M[i] = solve_linear(A, e, n_iterations, stop=stop)
return M.transpose()
def var(x):
""" Variance. """
mean = MemValue(type(x[0])(0))
@for_range_opt(len(x))
def _(i):
mean.iadd(x[i])
mean /= len(x)
res = MemValue(type(x[0])(0))
@for_range_opt(len(x))
def _(i):
res.iadd((x[i] - mean.read()) ** 2)
return res.read()