Merge FlexDropout into Dropout

Update mnist_full examples to use Dropout
Clean up code style
Hidde L
2025-10-21 16:26:15 +02:00
parent ba2b440712
commit e17e175b16
4 changed files with 23 additions and 82 deletions
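
In short, Dropout now takes the shape list that FlexDropout used to take, so one class covers both the flat (N, d1) case and arbitrary tensor shapes. A minimal before/after sketch of the call sites (illustrative dimensions; assumes an MP-SPDZ program where Compiler.ml is importable; alpha must be a power of two):

    from Compiler import ml   # assumed import; the examples' own import lines are not part of this diff

    N, n_inner = 128, 500     # hypothetical example count and hidden width

    # before this commit:
    #   ml.Dropout(N, n_inner)                       # flat case: N examples, d1 features
    #   ml.FlexDropout([N, 12, 64], alpha=0.125)     # arbitrary shapes went to FlexDropout
    # after this commit, both cases go through Dropout with a shape list [N, ...]:
    layers = [ml.Dropout([N, n_inner]),              # default alpha=0.5
              ml.Dropout([N, 12, 64], alpha=0.125)]  # alpha must be a power of two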

View File

@@ -978,12 +978,13 @@ class Dense(DenseBase):
@multithread(self.n_threads, N * self.d)
def _(base, size):
X_sub = sfix.Matrix(N * self.d, self.d_out, address=f_schur_Y.address)
offset = regint.inc(size, base=base)
result_matrix.assign_part_vector(
X_sub.direct_mul_trans(self.W, indices=(
offset, regint.inc(self.d_out),
regint.inc(self.d_out), regint.inc(self.d_in))), base)
X_sub.direct_mul_trans(self.W, indices=(regint.inc(size, base=base),
regint.inc(self.d_out),
regint.inc(self.d_out),
regint.inc(self.d_in))),
base)
if self.print_random_update:
print_ln('backward %s', self)
@@ -995,7 +996,6 @@ class Dense(DenseBase):
self.backward_params(f_schur_Y, batch=batch)
class QuantizedDense(DenseBase):
def __init__(self, N, d_in, d_out):
self.N = N
@@ -1048,64 +1048,7 @@ class Dropout(NoVariableLayer):
""" Dropout layer.
:param N: number of examples
:param d1: total dimension
:param alpha: probability (power of two)
"""
def __init__(self, N, d1, d2=1, alpha=0.5):
self.N = N
self.d1 = d1
self.d2 = d2
self.X = Tensor([N, d1, d2], sfix)
self.Y = Tensor([N, d1, d2], sfix)
self.nabla_Y = Tensor([N, d1, d2], sfix)
self.nabla_X = Tensor([N, d1, d2], sfix)
self.alpha = alpha
self.B = MultiArray([N, d1, d2], sint)
def __repr__(self):
return '%s(%s, %s, alpha=%s)' % \
(type(self).__name__, self.N, self.d1, self.alpha)
def forward(self, batch, training=False):
if training:
n_bits = -math.log(self.alpha, 2)
assert n_bits == int(n_bits)
n_bits = int(n_bits)
# self.B.assign_all(1) # TODO: temp disable for reproducibility
# self.alpha = 0.0 # TODO: temp disable for reproducibility
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
size = self.d1 * self.d2
self.B[i].assign_vector(util.tree_reduce(
util.or_op, (sint.get_random_bit(size=size)
for i in range(n_bits))))
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
self.Y[i].assign_vector(1 / (1 - self.alpha) *
self.X[batch[i]].get_vector() * self.B[i].get_vector())
else:
@for_range(len(batch))
def _(i):
self.Y[i] = self.X[batch[i]]
if self.debug_output:
print_ln('dropout X %s', self.X.reveal_nested())
print_ln('dropout Y %s', self.Y.reveal_nested())
def backward(self, compute_nabla_X=True, batch=None):
if compute_nabla_X:
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
self.nabla_X[batch[i]].assign_vector(
self.nabla_Y[i].get_vector() * self.B[i].get_vector())
if self.debug_output:
print_ln('dropout nabla_Y %s', self.nabla_Y.reveal_nested())
print_ln('dropout nabla_X %s', self.nabla_X.reveal_nested())
class FlexDropout(NoVariableLayer):
""" Dropout layer.
:param N: number of examples
:param d1: total dimension
:param shape: list [N, ...] where N is the number of examples and an arbitrary number of further dimensions
:param alpha: probability (power of two)
"""
def __init__(self, shape, alpha=0.5):
@@ -1126,8 +1069,6 @@ class FlexDropout(NoVariableLayer):
n_bits = -math.log(self.alpha, 2)
assert n_bits == int(n_bits)
n_bits = int(n_bits)
# self.B.assign_all(1)
# self.alpha = 0.0 # TODO: temp disable for reproducibility
@for_range_opt_multithread(self.n_threads, len(batch))
def _(i):
size = reduce(operator.mul, self.shape[1:])
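
For reference, the mask logic that both classes shared and that the merged Dropout keeps: alpha must equal 2**-n_bits, the secret mask B is the bitwise OR of n_bits random secret bits, so each entry is dropped (zero) with probability alpha, and the surviving activations are rescaled by 1 / (1 - alpha) so the expectation is unchanged. A plain-Python analogue, purely to illustrate the probabilities (not MP-SPDZ code):

    import math, random

    def dropout_mask(size, alpha=0.5):
        # alpha must be a power of two: alpha == 2 ** -n_bits
        n_bits = -math.log(alpha, 2)
        assert n_bits == int(n_bits)
        n_bits = int(n_bits)
        mask = [0] * size
        for _ in range(n_bits):
            mask = [m | random.getrandbits(1) for m in mask]
        return mask      # each entry is 0 with probability alpha

    def dropout_forward(x, alpha=0.5):
        mask = dropout_mask(len(x), alpha)
        # dropped entries become 0; kept entries are scaled to keep the expectation
        return [xi * m / (1 - alpha) for xi, m in zip(x, mask)]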
@@ -2984,7 +2925,7 @@ class BertOutput(BertBase):
super(BertOutput, self).__init__(input_shape, output_shape)
self.dense = Dense(n_examples, intermediate_size, hidden_size, seq_len)
self.layer_norm = LayerNorm(output_shape, layernorm_eps=layernorm_eps, approx=rsqrt_approx)
self.dropout = FlexDropout([n_examples, seq_len, hidden_size], alpha=dropout)
self.dropout = Dropout([n_examples, seq_len, hidden_size], alpha=dropout)
def forward(self, batch, input_tensor, training=False, input_tensor_batch=None):
@@ -3076,7 +3017,7 @@ class MultiHeadAttention(BertBase):
self.wq = Dense(n_examples, hidden_size, self.all_head_size, self.seq_len)
self.wk = Dense(n_examples, hidden_size, self.all_head_size, self.seq_len)
self.wv = Dense(n_examples, hidden_size, self.all_head_size, self.seq_len)
self.dropout = FlexDropout([internal_shape, self.num_attention_heads, self.seq_len, self.seq_len], alpha=dropout) # I think? # TODO: DROPOUT?
self.dropout = Dropout([internal_shape, self.num_attention_heads, self.seq_len, self.seq_len], alpha=dropout) # I think? # TODO: DROPOUT?
self.output = BertOutput(internal_shape, hidden_size, hidden_size, seq_len, dropout, layernorm_eps, rsqrt_approx)
self.context = sfix.Tensor([internal_shape, self.seq_len, hidden_size])
@@ -4306,8 +4247,8 @@ class keras:
layers.append(FixAveragePool2d(input_shape, None, **layer[1]))
input_shape = layers[-1].Y.sizes
elif name == 'dropout':
layers.append(Dropout(batch_size, reduce(
operator.mul, layers[-1].Y.sizes[1:]),
layers.append(Dropout([batch_size] + [reduce(
operator.mul, layers[-1].Y.sizes[1:])],
alpha=layer[1]))
input_shape = layers[-1].Y.sizes
elif name == 'flatten':
@@ -4544,7 +4485,7 @@ def layers_from_torch(model, data_input_shape, batch_size, input_via=None,
if alpha == 0.1:
print('WARNING: dropout rate 0.1 not supported, using 0.125')
alpha = 0.125
layers.append(Dropout(input_shape[0], mul(layers[-1].Y.sizes[1:]),
layers.append(Dropout([input_shape[0]] + list(layers[-1].Y.sizes[1:]),
alpha=alpha))
input_shape = layers[-1].Y.sizes
elif name == 'BertForSequenceClassification':
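
Note on the two wrapper paths above: the keras branch flattens everything after the batch dimension into a single size, while the torch branch keeps the trailing dimensions as they are; both now hand Dropout a shape list whose first entry is the batch size. A small standalone illustration with a hypothetical previous-layer shape:

    from functools import reduce
    import operator

    prev_sizes = (128, 24, 24, 20)   # hypothetical Y.sizes of the previous layer (batch first)

    # keras path: batch size plus one flattened dimension
    keras_shape = [prev_sizes[0]] + [reduce(operator.mul, prev_sizes[1:])]   # [128, 11520]

    # torch path: batch size plus the remaining dimensions as-is
    torch_shape = [prev_sizes[0]] + list(prev_sizes[1:])                     # [128, 24, 24, 20]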

View File

@@ -77,16 +77,16 @@ if 'batchnorm' in program.args:
if 'dropout' in program.args:
for i in range(len(layers) - 1, 0, -1):
layers.insert(i, ml.Dropout(N, n_inner))
layers.insert(i, ml.Dropout([N, n_inner]))
if 'dropout-late' in program.args:
layers.insert(-1, ml.Dropout(N, n_inner))
layers.insert(-1, ml.Dropout([N, n_inner]))
if 'dropout-early' in program.args:
layers.insert(0, ml.Dropout(n_examples, n_features))
layers.insert(0, ml.Dropout([n_examples, n_features]))
if 'dropout-early.25' in program.args:
layers.insert(0, ml.Dropout(n_examples, n_features, alpha=.25))
layers.insert(0, ml.Dropout([n_examples, n_features], alpha=.25))
layers += [ml.MultiOutput.from_args(program, n_examples, 10)]

View File

@@ -73,16 +73,16 @@ if 'batchnorm' in program.args:
layers.insert(1, ml.BatchNorm([N, 24, 24, 20], args=program.args))
if 'dropout' in program.args or 'dropout2' in program.args:
layers.insert(8, ml.Dropout(N, 500))
layers.insert(8, ml.Dropout([N, 500]))
elif 'dropout.25' in program.args:
layers.insert(8, ml.Dropout(N, 500, alpha=0.25))
layers.insert(8, ml.Dropout([N, 500], alpha=0.25))
elif 'dropout.125' in program.args:
layers.insert(8, ml.Dropout(N, 500, alpha=0.125))
layers.insert(8, ml.Dropout([N, 500], alpha=0.125))
if 'dropout2' in program.args:
layers.insert(6, ml.Dropout(N, 800, alpha=0.125))
layers.insert(6, ml.Dropout([N, 800], alpha=0.125))
elif 'dropout1' in program.args:
layers.insert(6, ml.Dropout(N, 800, alpha=0.5))
layers.insert(6, ml.Dropout([N, 800], alpha=0.5))
if 'no_relu' in program.args:
for x in layers:

View File

@@ -79,18 +79,18 @@ dropout = 'dropout' in program.args
if '1dense' in program.args:
if dropout:
layers += [ml.Dropout(N, n_inner)]
layers += [ml.Dropout([N, n_inner])]
layers += [ml.Dense(N, n_inner, 10),]
elif '2dense' in program.args:
if dropout:
layers += [ml.Dropout(N, n_inner)]
layers += [ml.Dropout([N, n_inner])]
layers += [
ml.Dense(N, n_inner, 100),
ml.Relu([N, 100]),
ml.Dense(N, 100, 10),
]
if dropout or 'dropout1' in program.args:
layers.insert(-1, ml.Dropout(N, 100))
layers.insert(-1, ml.Dropout([N, 100]))
else:
raise Exception('need to specify number of dense layers')