ARM support.

2026-01-09 13:37:58 -05:00 · 2021-04-19 21:26:54 +10:00
parent 6c89808733
commit 0f656fa7b7
112 changed files with 1736 additions and 475 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "Programs/Circuits"]
 	path = Programs/Circuits
 	url = https://github.com/mkskeller/bristol-fashion
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde
--- a/BMR/Key.h
+++ b/BMR/Key.h
@@ -7,11 +7,10 @@
 #define COMMON_INC_KEY_H_

 #include <iostream>
-#include <emmintrin.h>
-#include <smmintrin.h>
 #include <string.h>

 #include "Tools/FlexBuffer.h"
+#include "Tools/intrinsics.h"
 #include "Math/gf2nlong.h"

 using namespace std;
--- a/BMR/Party.cpp
+++ b/BMR/Party.cpp
@@ -371,7 +371,7 @@ void FakeProgramParty::receive_spdz_wires(ReceivedMsg& msg)
 		spdz_mac_key.unpack(spdz_wires[op].back());
 		if (!MC)
 		{
-			MC = new Passing_MAC_Check<Share<gf2n_long>>(spdz_mac_key);
+			MC = new MAC_Check_<Share<gf2n_long>>(spdz_mac_key);
 			cout << "MAC key: " << hex << spdz_mac_key << endl;
 			mac_key = spdz_mac_key;
 		}
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 The changelog explains changes pulled through from the private development repository. Bug fixes and small enhancements are committed between releases and not documented here.

+## 0.2.4 (Apr 19, 2021)
+
+- ARM support
+- Base OTs optionally without SimpleOT/AVX
+- Use OpenSSL instead of Crypto++ for elliptic curves
+- Post-sacrifice binary computation with replicated secret sharing similar
+  to [Araki et al.](https://www.ieee-security.org/TC/SP2017/papers/96.pdf)
+- More flexible multithreading
+
 ## 0.2.3 (Feb 23, 2021)

 - Distributed key generation for homomorphic encryption with active security similar to [Rotaru et al.](https://eprint.iacr.org/2019/1300)
--- a/32
+++ b/32
@@ -3,7 +3,6 @@ ROOT = .
 OPTIM= -O3
 #PROF = -pg
 #DEBUG = -DDEBUG
-#MEMPROTECT = -DMEMPROTECT
 GDEBUG = -g

 # set this to your preferred local storage directory
@@ -12,8 +11,8 @@ PREP_DIR = '-DPREP_DIR="Player-Data/"'
 # set for SHE preprocessing (SPDZ and Overdrive)
 USE_NTL = 0

-# set for using GF(2^128) online phase, OT, MASCOT, or BMR
-# unset for GF(2^40) online and offline phase
+# set for using GF(2^128)
+# unset for GF(2^40)
 USE_GF2N_LONG = 1

 # set to -march=<architecture> for optimization
@@ -28,6 +27,24 @@ USE_GF2N_LONG = 1
 ARCH = -mtune=native -msse4.1 -msse4.2 -maes -mpclmul -mavx -mavx2 -mbmi2 -madx
 ARCH = -march=native

+MACHINE := $(shell uname -m)
+OS := $(shell uname -s)
+ifeq ($(MACHINE), x86_64)
+# set AVX_OT to 0 to avoid using AVX for OT
+ifeq ($(OS), Linux)
+CHECK_AVX := $(shell grep -q avx /proc/cpuinfo; echo $$?)
+ifeq ($(CHECK_AVX), 0)
+AVX_OT = 1
+else
+AVX_OT = 0
+endif
+else
+AVX_OT = 1
+endif
+else
+AVX_OT = 0
+endif
+
 # allow to set compiler in CONFIG.mine
 CXX = g++

@@ -38,6 +55,10 @@ ifeq ($(USE_GF2N_LONG),1)
 GF2N_LONG = -DUSE_GF2N_LONG
 endif

+ifeq ($(AVX_OT), 0)
+CFLAGS += -DNO_AVX_OT
+endif
+
 # MAX_MOD_SZ (for FHE) must be least and GFP_MOD_SZ (for computation)
 # must be exactly ceil(len(p)/len(word)) for the relevant prime p
 # GFP_MOD_SZ only needs to be set for primes of bit length more that 256.
@@ -51,7 +72,6 @@ ifeq ($(USE_NTL),1)
 LDLIBS := -lntl $(LDLIBS)
 endif

-OS := $(shell uname -s)
 ifeq ($(OS), Linux)
 LDLIBS += -lrt
 endif
@@ -62,12 +82,10 @@ else
 BOOST = -lboost_thread $(MY_BOOST)
 endif

-CFLAGS += $(ARCH) $(MY_CFLAGS) $(GDEBUG) -Wextra -Wall $(OPTIM) -I$(ROOT) -pthread $(PROF) $(DEBUG) $(MOD) $(MEMPROTECT) $(GF2N_LONG) $(PREP_DIR) $(SECURE) -std=c++11 -Werror
+CFLAGS += $(ARCH) $(MY_CFLAGS) $(GDEBUG) -Wextra -Wall $(OPTIM) -I$(ROOT) -pthread $(PROF) $(DEBUG) $(MOD) $(GF2N_LONG) $(PREP_DIR) $(SECURE) -std=c++11 -Werror
 CPPFLAGS = $(CFLAGS)
 LD = $(CXX)

-ECLIB = -lcryptopp
-
 ifeq ($(OS), Darwin)
 ifeq ($(USE_NTL),1)
 CFLAGS += -Wno-error=unused-parameter
--- a/Compiler/GC/types.py
+++ b/Compiler/GC/types.py
@@ -284,7 +284,7 @@ class sbits(bits):
    Instances can be also be initalized from :py:obj:`~Compiler.types.regint`
    and :py:obj:`~Compiler.types.sint`.
    """
-    max_length = 128
+    max_length = 64
    reg_type = 'sb'
    is_clear = False
    clear_type = cbits
--- a/Compiler/comparison.py
+++ b/Compiler/comparison.py
@@ -190,6 +190,8 @@ def TruncLeakyInRing(a, k, m, signed):
    Returns a >> m.
    Requires a < 2^k and leaks a % 2^m (needs to be constant or random).
    """
+    if k == m:
+        return 0
    assert k > m
    assert int(program.options.ring) >= k
    from .types import sint, intbitint, cint, cgf2n
--- a/Compiler/dijkstra.py
+++ b/Compiler/dijkstra.py
@@ -103,7 +103,7 @@ class HeapQ(object):
        childpos = MemValue(start * shift)
        @for_range(self.levels - 1)
        def f(i):
-            parentpos = childpos.right_shift(1, self.levels)
+            parentpos = childpos.right_shift(1, self.levels + 1)
            parent, parent_state = self.heap.read_and_maybe_remove(parentpos)
            child, child_state = self.heap.read_and_maybe_remove(childpos)
            swap = parent > child
--- a/Compiler/floatingpoint.py
+++ b/Compiler/floatingpoint.py
@@ -1,3 +1,4 @@
+import math
 from math import log, floor, ceil
 from Compiler.instructions import *
 from . import types
@@ -411,6 +412,8 @@ def TruncInRing(to_shift, l, pow2m):
    return types.sint.bit_compose(reversed(bits))

 def SplitInRing(a, l, m):
+    if l == 1:
+        return m.if_else(a, 0), m.if_else(0, a), 1
    pow2m = Pow2(m, l, None)
    upper = TruncInRing(a, l, pow2m)
    lower = a - upper * pow2m
@@ -620,27 +623,36 @@ def BITLT(a, b, bit_length):
 def BitDecFull(a):
    from .library import get_program, do_while, if_, break_point
    from .types import sint, regint, longint
-    p=int(get_program().options.prime)
+    p = get_program().prime
    assert p
    bit_length = p.bit_length()
-    bbits = [sint(size=a.size) for i in range(bit_length)]
-    tbits = [[sint(size=1) for i in range(bit_length)] for j in range(a.size)]
-    pbits = util.bit_decompose(p)
-    # Loop until we get some random integers less than p
-    done = [regint(0) for i in range(a.size)]
-    @do_while
-    def get_bits_loop():
+    logp = int(round(math.log(p, 2)))
+    if abs(p - 2 ** logp) / p < 2 ** -get_program().security:
+        # inspired by Rabbit (https://eprint.iacr.org/2021/119)
+        # no need for exact randomness generation
+        # if modulo a power of two is close enough
+        bbits = [sint.get_random_bit(size=a.size) for i in range(logp)]
+        if logp != bit_length:
+            bbits += [sint(0, size=a.size)]
+    else:
+        bbits = [sint(size=a.size) for i in range(bit_length)]
+        tbits = [[sint(size=1) for i in range(bit_length)] for j in range(a.size)]
+        pbits = util.bit_decompose(p)
+        # Loop until we get some random integers less than p
+        done = [regint(0) for i in range(a.size)]
+        @do_while
+        def get_bits_loop():
+            for j in range(a.size):
+                @if_(done[j] == 0)
+                def _():
+                    for i in range(bit_length):
+                        tbits[j][i].link(sint.get_random_bit())
+                    c = regint(BITLT(tbits[j], pbits, bit_length).reveal())
+                    done[j].link(c)
+            return (sum(done) != a.size)
        for j in range(a.size):
-            @if_(done[j] == 0)
-            def _():
-                for i in range(bit_length):
-                    tbits[j][i].link(sint.get_random_bit())
-                c = regint(BITLT(tbits[j], pbits, bit_length).reveal())
-                done[j].link(c)
-        return (sum(done) != a.size)
-    for j in range(a.size):
-        for i in range(bit_length):
-            movs(bbits[i][j], tbits[j][i])
+            for i in range(bit_length):
+                movs(bbits[i][j], tbits[j][i])
    b = sint.bit_compose(bbits)
    c = (a-b).reveal()
    t = (p-c).bit_decompose(bit_length)
--- a/Compiler/instructions.py
+++ b/Compiler/instructions.py
@@ -1577,19 +1577,6 @@ class writesocketc(base.IOInstruction):
    def has_var_args(self):
        return True

-@base.vectorize
-class writesockets(base.IOInstruction):
-    """
-    Write a variable number of secret shares + MACs from registers into a socket
-    for a specified client id, message_type
-    """
-    __slots__ = []
-    code = base.opcodes['WRITESOCKETS']
-    arg_format = tools.chain(['ci', 'int'], itertools.repeat('s'))
-
-    def has_var_args(self):
-        return True
-
@base.vectorize
 class writesocketshare(base.IOInstruction):
    """ Write a variable number of shares (without MACs) from secret
--- a/Compiler/instructions_base.py
+++ b/Compiler/instructions_base.py
@@ -903,7 +903,7 @@ class DirectMemoryWriteInstruction(DirectMemoryInstruction, \
                                       WriteMemoryInstruction):
    __slots__ = []
    def __init__(self, *args, **kwargs):
-        if program.curr_tape.prevent_direct_memory_write:
+        if not program.curr_tape.singular:
            raise CompilerError('Direct memory writing prevented in threads')
        super(DirectMemoryWriteInstruction, self).__init__(*args, **kwargs)

--- a/Compiler/library.py
+++ b/Compiler/library.py
@@ -1062,14 +1062,14 @@ def for_range_opt_multithread(n_threads, n_loops):
    """
    return for_range_multithread(n_threads, None, n_loops)

-def multithread(n_threads, n_items, max_size=None):
+def multithread(n_threads, n_items=None, max_size=None):
    """
    Distribute the computation of :py:obj:`n_items` to
    :py:obj:`n_threads` threads, but leave the in-thread repetition up
    to the user.

    :param n_threads: compile-time (int)
-    :param n_items: regint/cint/int
+    :param n_items: regint/cint/int (default: :py:obj:`n_threads`)

    The following executes ``f(0, 8)``, ``f(8, 8)``, and
    ``f(16, 9)`` in three different threads:
@@ -1080,6 +1080,8 @@ def multithread(n_threads, n_items, max_size=None):
        def f(base, size):
            ...
    """
+    if n_items is None:
+        n_items = n_threads
    if max_size is None:
        return map_reduce(n_threads, None, n_items, initializer=lambda: [],
                          reducer=None, looping=False)
--- a/Compiler/ml.py
+++ b/Compiler/ml.py
@@ -703,6 +703,9 @@ class Dense(DenseBase):
        progress('f input')

    def forward(self, batch=None):
+        if batch is None:
+            batch = regint.Array(self.N)
+            batch.assign(regint.inc(self.N))
        self.compute_f_input(batch=batch)
        if self.activation_layer:
            self.activation_layer.forward(batch)
--- a/Compiler/oram.py
+++ b/Compiler/oram.py
@@ -91,7 +91,11 @@ class intBlock(Block):
            for length,start in zip(self.lengths[:-1],series(self.lengths)):
                res.append(remainder.mod2m(length, total_length - start, False))
                remainder -= res[-1]
-                remainder /= floatingpoint.two_power(length)
+                if Program.prog.options.ring:
+                    remainder = remainder.trunc_zeros(length,
+                                                      total_length - start, False)
+                else:
+                    remainder /= floatingpoint.two_power(length)
            res.append(remainder)
            return res
    def set_slice(self, value):
@@ -1498,12 +1502,12 @@ class PackedIndexStructure(object):
            rem = mod2m(index, self.log_entries_per_block, log2(self.size), False)
            c = mod2m(rem, self.log_entries_per_element, \
                          self.log_entries_per_block, False)
-            b = (rem - c).trunc_zeros(self.log_entries_per_element,
+            b = trunc_zeros(rem - c, self.log_entries_per_element,
                                      self.log_entries_per_block)
            if self.small:
                return 0, b, c
            else:
-                return (index - rem).trunc_zeros(self.log_entries_per_block,
+                return trunc_zeros(index - rem, self.log_entries_per_block,
                                                 log2(self.size)), b, c
        else:
            index_bits = bit_decompose(index, log2(self.size))
--- a/Compiler/program.py
+++ b/Compiler/program.py
@@ -118,7 +118,6 @@ class Program(object):
        self.req_num = None
        self.tape_stack = []
        self.n_threads = 1
-        self.free_threads = set()
        self.public_input_file = None
        self.types = {}
        self.budget = int(self.options.budget)
@@ -206,6 +205,28 @@ class Program(object):
        self.progname = progname

    def new_tape(self, function, args=[], name=None, single_thread=False):
+        """
+        Create a new tape from a function. See
+        :py:func:`~Compiler.library.multithread` and
+        :py:func:`~Compiler.library.for_range_opt_multithread` for
+        easier-to-use higher-level functionality. The following runs
+        two threads defined by two different functions::
+
+            def f():
+                ...
+            def g():
+                ...
+            tapes = [program.new_tape(x) for x in (f, g)]
+            thread_numbers = program.run_tapes(tapes)
+            program.join_tapes(thread_numbers)
+
+        :param function: Python function defining the thread
+        :param args: arguments to the function
+        :param name: name used for files
+        :param single_thread: Boolean indicating whether tape will never be run in parallel to itself
+        :returns: tape handle
+
+        """
        if name is None:
            name = function.__name__
        name = "%s-%s" % (self.name, name)
@@ -214,7 +235,7 @@ class Program(object):
        tape_index = len(self.tapes)
        self.tape_stack.append(self.curr_tape)
        self.curr_tape = Tape(name, self)
-        self.curr_tape.prevent_direct_memory_write = not single_thread
+        self.curr_tape.singular = single_thread
        self.tapes.append(self.curr_tape)
        function(*args)
        self.finalize_tape(self.curr_tape)
@@ -226,14 +247,31 @@ class Program(object):
        return self.run_tapes([[tape_index, arg]])[0]

    def run_tapes(self, args):
-        if self.curr_tape is not self.tapes[0]:
+        """ Run tapes in parallel. See :py:func:`new_tape` for an example.
+
+        :param args: list of tape handles or tuples of tape handle and extra argument (for :py:func:`~Compiler.library.get_arg`)
+        :returns: list of thread numbers
+        """
+        if not self.curr_tape.singular:
            raise CompilerError('Compiler does not support ' \
                                    'recursive spawning of threads')
+        args = [list(util.tuplify(arg)) for arg in args]
+        singular_tapes = set()
+        for arg in args:
+            if self.tapes[arg[0]].singular:
+                if arg[0] in singular_tapes:
+                    raise CompilerError('cannot run singular tape in parallel')
+                singular_tapes.add(arg[0])
+            assert len(arg)
+            assert len(arg) <= 2
+            if len(arg) == 1:
+                arg += [0]
        thread_numbers = []
        while len(thread_numbers) < len(args):
-            if self.free_threads:
-                thread_numbers.append(min(self.free_threads))
-                self.free_threads.remove(thread_numbers[-1])
+            free_threads = self.curr_tape.free_threads
+            if free_threads:
+                thread_numbers.append(min(free_threads))
+                free_threads.remove(thread_numbers[-1])
            else:
                thread_numbers.append(self.n_threads)
                self.n_threads += 1
@@ -247,10 +285,18 @@ class Program(object):
        return thread_numbers

    def join_tape(self, thread_number):
+        self.join_tapes([thread_number])
+
+    def join_tapes(self, thread_numbers):
+        """ Wait for completion of tapes.  See :py:func:`new_tape` for an example.
+
+        :param thread_numbers: list of thread numbers
+        """
        self.curr_tape.start_new_basicblock(name='pre-join_tape')
-        Compiler.instructions.join_tape(thread_number)
+        for thread_number in thread_numbers:
+            Compiler.instructions.join_tape(thread_number)
+            self.curr_tape.free_threads.add(thread_number)
        self.curr_tape.start_new_basicblock(name='post-join_tape')
-        self.free_threads.add(thread_number)

    def update_req(self, tape):
        if self.req_num is None:
@@ -259,6 +305,7 @@ class Program(object):
            self.req_num += tape.req_num
    
    def write_bytes(self):
+
        """ Write all non-empty threads and schedule to files. """

        nonempty_tapes = [t for t in self.tapes]
@@ -312,7 +359,7 @@ class Program(object):
        """ Allocate memory from the top """
        if not isinstance(size, int):
            raise CompilerError('size must be known at compile time')
-        if (creator_tape or self.curr_tape) != self.tapes[0]:
+        if not (creator_tape or self.curr_tape).singular:
            raise CompilerError('cannot allocate memory outside main thread')
        if size == 0:
            return
@@ -510,7 +557,8 @@ class Tape:
        self.req_bit_length = defaultdict(lambda: 0)
        self.function_basicblocks = {}
        self.functions = []
-        self.prevent_direct_memory_write = False
+        self.singular = True
+        self.free_threads = set()

    class BasicBlock(object):
        def __init__(self, parent, name, scope, exit_condition=None):
--- a/Compiler/types.py
+++ b/Compiler/types.py
@@ -21,7 +21,13 @@ Basic types
 -----------

 Basic types contain many special methods such as :py:func:`__add__`. This is
-used for operator overloading in Python. In some operations such as
+used for operator overloading in Python. It is not recommended to use
+them, use the plain operators instead, such as ``+`` instead of
+:py:func:`__add__`. See
+https://docs.python.org/3/reference/datamodel.html#special-method-names
+for a translation to operators.
+
+In some operations such as
 secure comparison, the secure computation protocols allows for more
 parameters than just the operands which influence the performance. In
 this case, we provide an alias for better code readability. For
@@ -780,7 +786,12 @@ class cint(_clear, _int):

    @vectorized_classmethod
    def read_from_socket(cls, client_id, n=1):
-        """ Read a list of clear values from socket. """
+        """ Receive clear value(s) from client.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: cint (if n=1) or list of cint
+        """
        res = [cls() for i in range(n)]
        readsocketc(client_id, *res)
        if n == 1:
@@ -790,7 +801,11 @@ class cint(_clear, _int):

    @vectorized_classmethod
    def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of clear values to socket """
+        """ Send a list of clear values to a client.
+
+        :param client_id: Client id (regint)
+        :param values: list of cint
+        """
        writesocketc(client_id, message_type, *values)

    @vectorized_classmethod
@@ -1207,7 +1222,12 @@ class regint(_register, _int):

    @vectorized_classmethod
    def read_from_socket(cls, client_id, n=1):
-        """ Receive n register values from socket """
+        """ Receive clear integer value(s) from client.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: regint (if n=1) or list of regint
+        """
        res = [cls() for i in range(n)]
        readsocketint(client_id, *res)
        if n == 1:
@@ -1217,7 +1237,11 @@ class regint(_register, _int):

    @vectorized_classmethod
    def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of integers to socket """
+        """ Send a list of clear integers to a client.
+
+        :param client_id: Client id (regint)
+        :param values: list of regint
+        """
        writesocketint(client_id, message_type, *values)

    @vectorize_init
@@ -1805,6 +1829,14 @@ class sint(_secret, _int):
    PreOR = staticmethod(floatingpoint.PreOR)
    get_type = staticmethod(lambda n: sint)

+    @staticmethod
+    def require_bit_length(n_bits):
+        if program.options.ring:
+            if int(program.options.ring) < n_bits:
+                raise CompilerError('computation modulus too small')
+        else:
+            program.curr_tape.require_bit_length(n_bits)
+
    @vectorized_classmethod
    def get_random_int(cls, bits):
        """ Secret random n-bit number according to security model.
@@ -1906,7 +1938,12 @@ class sint(_secret, _int):

    @vectorized_classmethod
    def read_from_socket(cls, client_id, n=1):
-        """ Receive n shares and MAC shares from socket """
+        """ Receive secret-shared value(s) from client.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: sint (if n=1) or list of sint
+        """
        res = [cls() for i in range(n)]
        readsockets(client_id, *res)
        if n == 1:
@@ -1914,27 +1951,46 @@ class sint(_secret, _int):
        else:
            return res

-    @vectorized_classmethod
-    def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of shares and MAC shares to socket """
-        writesockets(client_id, message_type, *values)
-
    @vectorize
    def write_share_to_socket(self, client_id, message_type=ClientMessageType.NoType):
        """ Send only share to socket """
        writesocketshare(client_id, message_type, self)

    @vectorized_classmethod
-    def write_shares_to_socket(cls, client_id, values, message_type=ClientMessageType.NoType, include_macs=False):
+    def write_shares_to_socket(cls, client_id, values,
+                               message_type=ClientMessageType.NoType):
        """ Send shares of a list of values to a specified client socket.

        :param client_id: regint
        :param values: list of sint
        """
-        if include_macs:
-            writesockets(client_id, message_type, *values)
-        else:
-            writesocketshare(client_id, message_type, *values)
+        writesocketshare(client_id, message_type, *values)
+
+    @classmethod
+    def read_from_file(cls, start, n_items):
+        """ Read shares from ``Persistence/Transactions-P<playerno>.data``.
+
+        :param start: starting position in number of shares from beginning (int/regint/cint)
+        :param n_items: number of items (int)
+        :returns: destination for final position, -1 for eof reached, or -2 for file not found (regint)
+        :returns: list of shares
+        """
+        shares = [cls(size=1) for i in range(n_items)]
+        stop = regint()
+        readsharesfromfile(regint.conv(start), stop, *shares)
+        return stop, shares
+
+    @staticmethod
+    def write_to_file(shares):
+        """ Write shares to ``Persistence/Transactions-P<playerno>.data``
+        (appending at the end).
+
+        :param shares: list or iterable of sint, each of size 1
+        """
+        shares = list(shares)  # materialize: a one-shot iterable would be exhausted by the checks below
+        for share in shares:
+            assert isinstance(share, sint) and share.size == 1
+        writesharestofile(*shares)

    @vectorized_classmethod
    def load_mem(cls, address, mem_type=None):
@@ -2920,8 +2976,14 @@ class cfix(_number, _structure):

    @vectorized_classmethod
    def read_from_socket(cls, client_id, n=1):
-        """ Read one or more cfix values from a socket. 
-            Sender will have already bit shifted and sent as cints."""
+        """
+        Receive clear fixed-point value(s) from client. The client needs
+        to convert the values to the right integer representation.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: cfix (if n=1) or list of cfix
+        """
        cint_input = cint.read_from_socket(client_id, n)
        if n == 1:
            return cfix._new(cint_inputs)
@@ -2930,7 +2992,12 @@ class cfix(_number, _structure):
        
    @vectorized_classmethod
    def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of cfix values to socket. Values are sent as bit shifted cints. """
+        """ Send a list of clear fixed-point values to a client
+        (represented as clear integers).
+
+        :param client_id: Client id (regint)
+        :param values: list of cfix
+        """
        def cfix_to_cint(fix_val):
            return cint(fix_val.v)
        cint_values = list(map(cfix_to_cint, values))
@@ -3182,15 +3249,8 @@ class cfix(_number, _structure):

    def print_plain(self):
        """ Clear fixed-point output. """
-        if self.k > 64:
-            sign = 1 - (((self.v + (1 << (self.k - 1))) >> (self.k - 1)) & 1)
-        else:
-            tmp = regint()
-            convmodp(tmp, self.v, bitlength=self.k)
-            sign = cint(tmp < 0)
-        abs_v = sign.if_else(-self.v, self.v)
-        print_float_plain(cint(abs_v), cint(-self.f), \
-                          cint(0), cint(sign), cint(0))
+        print_float_plain(cint.conv(self.v), cint(-self.f), \
+                          cint(0), cint(0), cint(0))

    def output_if(self, cond):
        cond_print_plain(cond, self.v, cint(-self.f))
@@ -3206,8 +3266,14 @@ class _single(_number, _structure):

    @classmethod
    def receive_from_client(cls, n, client_id, message_type=ClientMessageType.NoType):
-        """ Securely obtain shares of n values input by a client.
-            Assumes client has already run bit shift to convert fixed point to integer."""
+        """
+        Securely obtain shares of values input by a client. Assumes client
+        has already converted values to integer representation.
+
+        :param n: number of inputs (int)
+        :param client_id: regint
+
+        """
        sint_inputs = cls.int_type.receive_from_client(n, client_id, ClientMessageType.TripleShares)
        return list(map(cls, sint_inputs))

@@ -3574,6 +3640,7 @@ class sfix(_fix):
        """ Secret fixed-point input.

        :param player: public (regint/cint/int) """
+        cls.int_type.require_bit_length(cls.k)
        v = cls.int_type()
        inputmixed('fix', v, cls.f, player)
        return cls._new(v)
@@ -4486,7 +4553,7 @@ class Array(object):
                raise CompilerError('cannot assign vector to all elements')
        mem_value = MemValue(value)
        self.address = MemValue.if_necessary(self.address)
-        n_threads = 8 if use_threads and len(self) > 2**20 else 1
+        n_threads = 8 if use_threads and len(self) > 2**20 else None
        @library.for_range_multithread(n_threads, 1024, len(self))
        def f(i):
            self[i] = mem_value
--- a/Compiler/util.py
+++ b/Compiler/util.py
@@ -40,6 +40,12 @@ def mod2m(a, b, bits, signed):
    else:
        return a.mod2m(b, bits, signed=signed)

+def trunc_zeros(a, n_zeros, bit_length=None):
+    if isinstance(a, int):
+        return a >> n_zeros
+    else:
+        return a.trunc_zeros(n_zeros, bit_length)
+
 def right_shift(a, b, bits):
    if isinstance(a, int):
        return a >> b
--- a/ECDSA/CurveElement.cpp
+++ b/ECDSA/CurveElement.cpp
@@ -0,0 +1,142 @@
+/*
+ * CurveElement.cpp
+ *
+ */
+
+#include <ECDSA/CurveElement.h>
+
+#include "Math/gfp.hpp"
+
+unsigned char CurveElement::zero[crypto_core_ristretto255_BYTES];
+
+void CurveElement::init()
+{
+    Scalar::init_field(
+            (bigint(1) << 252) + bigint("27742317777372353535851937790883648493"), // presumably the ristretto255 group order -- TODO confirm
+            false);
+    if(sodium_init() == -1)
+        throw runtime_error("cannot initalize sodium");
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    memset(tmp, 0, sizeof(tmp)); // all-zero scalar
+    crypto_scalarmult_ristretto255_base(zero, tmp); // stores base * 0 in `zero`; the return value is deliberately not checked
+}
+
+void CurveElement::convert(unsigned char* res, const Scalar& other)
+{
+    bigint tmp;
+    tmp = other;
+    assert(tmp.__get_mp()->_mp_size * sizeof(mp_limb_t) <= crypto_core_ristretto255_SCALARBYTES); // the limbs must fit into the scalar buffer
+    memset(res, 0, crypto_core_ristretto255_SCALARBYTES); // zero-pad the bytes beyond the copied limbs
+    memcpy(res, tmp.__get_mp()->_mp_d, abs(tmp.__get_mp()->_mp_size) * sizeof(mp_limb_t)); // NOTE(review): raw limb copy presumably relies on a little-endian host -- confirm for new targets
+}
+
+CurveElement::CurveElement()
+{
+    memcpy(a, zero, sizeof(a));
+    check();
+}
+
+CurveElement::CurveElement(const Scalar& other)
+{
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    convert(tmp, other);
+    crypto_scalarmult_ristretto255_base(a, tmp);
+    check();
+}
+
+CurveElement::CurveElement(word other)
+{
+    if (other == 0)
+    {
+        *this = CurveElement();
+        return;
+    }
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    memset(tmp, 0, sizeof(tmp));
+    memcpy(tmp, &other, sizeof(other));
+    crypto_scalarmult_ristretto255_base(a, tmp);
+    check();
+}
+
+void CurveElement::check()
+{
+#ifdef CURVE_CHECK
+    if (crypto_core_ristretto255_is_valid_point(a) != 1)
+        throw runtime_error("curve point not valid");
+#endif
+}
+
+CurveElement CurveElement::operator +(const CurveElement& other) const
+{
+    CurveElement res;
+    crypto_core_ristretto255_add(res.a, a, other.a);
+    res.check();
+    return res;
+}
+
+CurveElement CurveElement::operator -(const CurveElement& other) const
+{
+    CurveElement res;
+    crypto_core_ristretto255_sub(res.a, a, other.a);
+    res.check();
+    return res;
+}
+
+CurveElement CurveElement::operator *(const Scalar& other) const
+{
+    CurveElement res;
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    convert(tmp, other);
+    if (crypto_scalarmult_ristretto255(res.a, tmp, a) < 0)
+    {
+        cerr << "EC multiplication by zero" << endl;
+    }
+    res.check();
+    return res;
+}
+
+CurveElement& CurveElement::operator +=(const CurveElement& other)
+{
+    *this = *this + other;
+    return *this;
+}
+
+bool CurveElement::operator ==(const CurveElement& other) const
+{
+    for (size_t i = 0; i < sizeof a; i++)
+        if (a[i] != other.a[i])
+            return false;
+    return true;
+}
+
+bool CurveElement::operator !=(const CurveElement& other) const
+{
+    return not (*this == other);
+}
+
+void CurveElement::pack(octetStream& os) const
+{
+    os.append(a, sizeof(a));
+}
+
+void CurveElement::unpack(octetStream& os)
+{
+    os.consume(a, sizeof(a));
+    check();
+}
+
+ostream& operator <<(ostream& s, const CurveElement& x)
+{
+    s << hex << *(word*)x.get(); // prints only the first sizeof(word) bytes; NOTE(review): reads a byte array through a word* cast -- consider memcpy into a local word to avoid unaligned/aliasing access
+    return s;
+}
+
+octetStream CurveElement::hash(size_t n_bytes) const
+{
+    octetStream os;
+    pack(os);
+    auto res = os.hash();
+    assert(n_bytes >= res.get_length());
+    res.resize_precise(n_bytes);
+    return res;
+}
--- a/ECDSA/CurveElement.h
+++ b/ECDSA/CurveElement.h
@@ -0,0 +1,63 @@
+/*
+ * CurveElement.h
+ *
+ */
+
+#ifndef ECDSA_CURVEELEMENT_H_
+#define ECDSA_CURVEELEMENT_H_
+
+#include <sodium.h>
+
+#include "Math/gfp.h"
+
+class CurveElement : public ValueInterface
+{
+public:
+    typedef gfp_<2, 4> Scalar;
+
+private:
+    static unsigned char zero[crypto_core_ristretto255_BYTES];
+
+    unsigned char a[crypto_core_ristretto255_BYTES];
+
+    static void convert(unsigned char* res, const Scalar& other);
+
+public:
+    typedef void next;
+    typedef void Square;
+
+    static int size() { return sizeof(a); }
+    static string type_string() { return "Curve25519"; }
+
+    static void init();
+
+    CurveElement();
+    CurveElement(const Scalar& other);
+    CurveElement(word other);
+
+    void check();
+
+    const unsigned char* get() const { return a; }
+
+    CurveElement operator+(const CurveElement& other) const;
+    CurveElement operator-(const CurveElement& other) const;
+    CurveElement operator*(const Scalar& other) const;
+
+    CurveElement& operator+=(const CurveElement& other);
+
+    bool operator==(const CurveElement& other) const;
+    bool operator!=(const CurveElement& other) const;
+
+    void assign_zero() { *this = 0; }
+    bool is_zero() { return *this == 0; }
+    void add(octetStream& os) { *this += os.get<CurveElement>(); }
+
+    void pack(octetStream& os) const;
+    void unpack(octetStream& os);
+
+    octetStream hash(size_t n_bytes) const;
+};
+
+ostream& operator<<(ostream& s, const CurveElement& x);
+
+#endif /* ECDSA_CURVEELEMENT_H_ */
--- a/ECDSA/P256Element.cpp
+++ b/ECDSA/P256Element.cpp
@@ -7,72 +7,127 @@

 #include "Math/gfp.hpp"

-#include <cryptopp/oids.h>
-#include <cryptopp/misc.h>
-
-CryptoPP::DL_GroupParameters_EC<CryptoPP::ECP> P256Element::params;
-CryptoPP::ECP P256Element::curve;
+EC_GROUP* P256Element::curve;

 void P256Element::init()
 {
-    params = CryptoPP::DL_GroupParameters_EC<CryptoPP::ECP>(CryptoPP::ASN1::secp256k1());
-    curve = params.GetCurve();
-    auto mod = params.GetSubgroupOrder();
-    Scalar::init_field(CryptoPP::IntToString(mod).c_str(), false);
-}
-
-CryptoPP::Integer P256Element::convert(const Scalar& other)
-{
-    return CryptoPP::Integer((unsigned char*) other.get_ptr(), other.size(),
-            CryptoPP::Integer::UNSIGNED, CryptoPP::LITTLE_ENDIAN_ORDER);
+    curve = EC_GROUP_new_by_curve_name(NID_secp256k1);
+    assert(curve != 0);
+    auto modulus = EC_GROUP_get0_order(curve);
+    Scalar::init_field(BN_bn2dec(modulus), false);
 }

 P256Element::P256Element()
 {
-    point = curve.Identity();
+    point = EC_POINT_new(curve);
+    assert(point != 0);
+    assert(EC_POINT_set_to_infinity(curve, point) != 0);
 }

-P256Element::P256Element(const Scalar& other)
+// Construct generator * other.
+P256Element::P256Element(const Scalar& other) :
+        P256Element()
 {
-    point = params.ExponentiateBase(convert(other));
+    BIGNUM* exp = BN_new();
+    assert(exp != 0);
+    // all OpenSSL calls are made outside assert() so they survive NDEBUG
+    int ok = BN_dec2bn(&exp, bigint(other).get_str().c_str());
+    assert(ok != 0);
+    // n * generator; EC_POINT_mul replaces the deprecated EC_POINTs_mul
+    ok = EC_POINT_mul(curve, point, exp, 0, 0, 0);
+    assert(ok != 0);
+    (void) ok;
+    BN_free(exp);
 }

-P256Element::P256Element(word other)
+// Construct generator * other.
+P256Element::P256Element(word other) :
+        P256Element()
 {
-    point = params.ExponentiateBase(other);
+    BIGNUM* exp = BN_new();
+    assert(exp != 0);
+    // all OpenSSL calls are made outside assert() so they survive NDEBUG
+    int ok = BN_dec2bn(&exp, to_string(other).c_str());
+    assert(ok != 0);
+    // n * generator; EC_POINT_mul replaces the deprecated EC_POINTs_mul
+    ok = EC_POINT_mul(curve, point, exp, 0, 0, 0);
+    assert(ok != 0);
+    (void) ok;
+    BN_free(exp);
+}
+
+// Copy the value of other.point into this element's own point.
+P256Element& P256Element::operator =(const P256Element& other)
+{
+    // the copy must not live inside assert(): with NDEBUG it would be dropped
+    int ok = EC_POINT_copy(point, other.point);
+    assert(ok != 0);
+    (void) ok;
+    return *this;
 }

+// Asserts that the point lies on the curve.
+// NOTE(review): the test is the assert() argument, so it is compiled out
+// entirely when NDEBUG is defined.
 void P256Element::check()
 {
-    curve.VerifyPoint(point);
+    assert(EC_POINT_is_on_curve(curve, point, 0) == 1);
 }

 P256Element::Scalar P256Element::x() const
 {
-    return bigint(IntToString(point.x));
+    BIGNUM* x = BN_new();
+    assert(EC_POINT_get_affine_coordinates_GFp(curve, point, x, 0, 0) != 0);
+    char* xx = BN_bn2dec(x);
+    Scalar res((bigint(xx)));
+    OPENSSL_free(xx);
+    BN_free(x);
+    return res;
 }

 P256Element P256Element::operator +(const P256Element& other) const
 {
    P256Element res;
-    res.point = curve.Add(point, other.point);
+    assert(EC_POINT_add(curve, res.point, point, other.point, 0) != 0);
    return res;
 }

 P256Element P256Element::operator -(const P256Element& other) const
 {
-    P256Element res;
-    res.point = curve.Add(point, curve.Inverse(other.point));
-    return res;
+    P256Element tmp = other;
+    assert(EC_POINT_invert(curve, tmp.point, 0) != 0);
+    return *this + tmp;
 }

 P256Element P256Element::operator *(const Scalar& other) const
 {
    P256Element res;
-    res.point = curve.Multiply(convert(other), point);
+    BIGNUM* exp = BN_new();
+    BN_dec2bn(&exp, bigint(other).get_str().c_str());
+    assert(EC_POINT_mul(curve, res.point, 0, point, exp, 0) != 0);
+    BN_free(exp);
    return res;
 }

+// Equality via EC_POINT_cmp(); a result other than 0 or 1 trips the assert.
+bool P256Element::operator ==(const P256Element& other) const
+{
+    const int difference = EC_POINT_cmp(curve, point, other.point, 0);
+    assert(difference == 0 or difference == 1);
+    return difference == 0;
+}
+
+// Append the point to `os` as an 8-byte length followed by the compressed
+// encoding produced by EC_POINT_point2buf().
+void P256Element::pack(octetStream& os) const
+{
+    octet* buffer = 0;
+    size_t length = EC_POINT_point2buf(curve, point,
+            POINT_CONVERSION_COMPRESSED, &buffer, 0);
+    assert(length != 0);
+    os.store_int(length, 8);
+    os.append(buffer, length);
+    // the buffer is allocated by OpenSSL and has to be released by the caller
+    OPENSSL_free(buffer);
+}
+
+// Read an 8-byte length and that many encoded bytes from `os` and decode
+// them into this point.
+void P256Element::unpack(octetStream& os)
+{
+    size_t length = os.get_int(8);
+    // decoding must not live inside assert(): with NDEBUG it would be dropped
+    int ok = EC_POINT_oct2point(curve, point, os.consume(length), length, 0);
+    assert(ok != 0);
+    (void) ok;
+}
+
+// Print the compressed encoding of the point as a hex string.
+ostream& operator <<(ostream& s, const P256Element& x)
+{
+    char* hex = EC_POINT_point2hex(x.curve, x.point,
+            POINT_CONVERSION_COMPRESSED, 0);
+    // point2hex returns NULL on failure; a null char* must not be streamed
+    assert(hex != 0);
+    s << hex;
+    OPENSSL_free(hex);
+    return s;
+}
+
+// Copy constructor: allocate a fresh point through the default constructor,
+// then take over the value of `other`.
+P256Element::P256Element(const P256Element& other) :
+        P256Element()
+{
+    operator =(other);
+}
+
 P256Element operator*(const P256Element::Scalar& x, const P256Element& y)
 {
    return y * x;
@@ -90,44 +145,17 @@ P256Element& P256Element::operator /=(const Scalar& other)
    return *this;
 }

-bool P256Element::operator ==(const P256Element& other) const
-{
-    return point == other.point;
-}
-
 bool P256Element::operator !=(const P256Element& other) const
 {
    return not (*this == other);
 }

-void P256Element::pack(octetStream& os) const
+// Hash the packed point and return the digest resized to exactly n_bytes.
+// n_bytes must not be smaller than the digest length from octetStream::hash().
+octetStream P256Element::hash(size_t n_bytes) const
 {
-    os.serialize(point.identity);
-    size_t l;
-    l = point.x.MinEncodedSize();
-    os.serialize(l);
-    point.x.Encode(os.append(l), l);
-    l = point.y.MinEncodedSize();
-    os.serialize(l);
-    point.y.Encode(os.append(l), l);
-}
-
-void P256Element::unpack(octetStream& os)
-{
-    os.unserialize(point.identity);
-    size_t l;
-    os.unserialize(l);
-    point.x.Decode(os.consume(l), l);
-    os.unserialize(l);
-    point.y.Decode(os.consume(l), l);
-}
-
-ostream& operator <<(ostream& s, const P256Element& x)
-{
-    auto& point = x.get();
-    if (point.identity)
-        s << "ID" << endl;
-    else
-        s << point.x << "," << point.y;
-    return s;
+    octetStream packed;
+    pack(packed);
+    auto digest = packed.hash();
+    assert(digest.get_length() <= n_bytes);
+    digest.resize_precise(n_bytes);
+    return digest;
 }
--- a/ECDSA/P256Element.h
+++ b/ECDSA/P256Element.h
@@ -6,7 +6,8 @@
 #ifndef ECDSA_P256ELEMENT_H_
 #define ECDSA_P256ELEMENT_H_

-#include <cryptopp/eccrypto.h>
+#include <openssl/ec.h>
+#include <openssl/obj_mac.h>

 #include "Math/gfp.h"

@@ -16,12 +17,9 @@ public:
    typedef gfp_<2, 4> Scalar;

 private:
-    static CryptoPP::DL_GroupParameters_EC<CryptoPP::ECP> params;
-    static CryptoPP::ECP curve;
+    static EC_GROUP* curve;

-    CryptoPP::ECP::Point point;
-
-    static CryptoPP::Integer convert(const Scalar& other);
+    EC_POINT* point;

 public:
    typedef void next;
@@ -35,13 +33,13 @@ public:
    static void init();

    P256Element();
+    P256Element(const P256Element& other);
+    // every constructor allocates `point` with EC_POINT_new(); release it
+    ~P256Element() { EC_POINT_free(point); }
    P256Element(const Scalar& other);
    P256Element(word other);

-    void check();
+    P256Element& operator=(const P256Element& other);

-    const CryptoPP::ECP::Point& get() const { return point; }
-//    const unsigned char* get() const { return a; }
+    void check();

    Scalar x() const;

@@ -55,16 +53,18 @@ public:
    bool operator==(const P256Element& other) const;
    bool operator!=(const P256Element& other) const;

-    void assign_zero() { *this = 0; }
-    bool is_zero() { return *this == 0; }
+    void assign_zero() { *this = {}; }
+    bool is_zero() { return *this == P256Element(); }
    void add(octetStream& os) { *this += os.get<P256Element>(); }

    void pack(octetStream& os) const;
    void unpack(octetStream& os);
+
+    octetStream hash(size_t n_bytes) const;
+
+    friend ostream& operator<<(ostream& s, const P256Element& x);
 };

 P256Element operator*(const P256Element::Scalar& x, const P256Element& y);

-ostream& operator<<(ostream& s, const P256Element& x);
-
 #endif /* ECDSA_P256ELEMENT_H_ */
--- a/ECDSA/README.md
+++ b/ECDSA/README.md
@@ -5,9 +5,6 @@ in `preprocessing.hpp` and `sign.hpp`, respectively.

 #### Compilation

- Add either `CXX = clang++` or `OPTIM = -O2` because GCC 8 or later with `-O3` will produce a segfault when using `mascot-ecdsa-party.x`
- For older hardware, also add `ARCH = -march=native`
- Install [Crypto++](https://www.cryptopp.com) (`libcrypto++-dev` on Ubuntu). We used version 5.6.4, which is the default on Ubuntu 18.04.
 - Compile the binaries: `make -j8 ecdsa`
 - Or compile the static binaries: `make -j8 ecdsa-static`

--- a/FHE/AddableVector.cpp
+++ b/FHE/AddableVector.cpp
@@ -35,5 +35,5 @@ AddableVector<T> AddableVector<T>::mul_by_X_i(int j,
 }

 template
-AddableVector<fixint<0>> AddableVector<fixint<0>>::mul_by_X_i(int j,
-        const FHE_PK& pk) const;
+AddableVector<Int_Random_Coins::rand_type> AddableVector<
+        Int_Random_Coins::rand_type>::mul_by_X_i(int j, const FHE_PK& pk) const;
--- a/FHE/FHE_Keys.cpp
+++ b/FHE/FHE_Keys.cpp
@@ -114,7 +114,9 @@ void FHE_PK::check_noise(const Rq_Element& x, bool check_modulo) const
      noise[i] /= pr;
      m = m > noise[i] ? m : noise[i];
    }
+#ifdef VERBOSE_KEYGEN
  cerr << "max noise: " << m << endl;
+#endif
 }


--- a/FHE/NTL-Subs.cpp
+++ b/FHE/NTL-Subs.cpp
@@ -516,7 +516,9 @@ void init(P2Data& P2D,const Ring& Rg)
  
  imatrix A;
  A.resize(Rg.phi_m(), imatrix::value_type(Gord*gf2n_short::degree()));
-  P2D.A.resize(A[0].size(), imatrix::value_type(A.size()));
+  P2D.A.resize(A[0].size());
+  for (auto& x : P2D.A)
+    x.resize(A.size());
  for (int slot=0; slot<Gord; slot++)
    { for (int co=0; co<gf2n_short::degree(); co++)
        { // Work out how x^co in given slot maps to plaintext vector
--- a/FHE/Random_Coins.h
+++ b/FHE/Random_Coins.h
@@ -9,7 +9,11 @@

 class FHE_PK;

-class Int_Random_Coins : public AddableMatrix<fixint<0>>
+#ifndef N_LIMBS_RAND
+#define N_LIMBS_RAND 0
+#endif
+
+class Int_Random_Coins : public AddableMatrix<fixint<N_LIMBS_RAND>>
 {
  typedef value_type::value_type T;

--- a/FHEOffline/DistKeyGen.cpp
+++ b/FHEOffline/DistKeyGen.cpp
@@ -109,7 +109,6 @@ DistKeyGen::DistKeyGen(const FHE_Params& params, const bigint& p) :
 */
 void DistKeyGen::Gen_Random_Data(PRNG& G)
 {
-    cout << "In Gen Random Data " << endl;
    secret.from_vec(params.sampleHwt(G));
    rc1.generate(G);
    rc2.generate(G);
@@ -228,7 +227,9 @@ void check_randomness(vector<octetStream>& seeds,
    // Re-create the randomness from these seeds
    for (int i = 0; i < num_players; i++)
      { G.SetSeed(seeds[i].get_data());
+#ifdef VERBOSE_KEYGEN
        cout << "\tSeed for player " << i << " is..." << seeds[i] << endl;
+#endif
        playerKeys[i].Gen_Random_Data(G);
        globalKey += playerKeys[i];
      }
@@ -292,22 +293,27 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
 {
  const FHE_Params& params=pk.get_params();

+#ifdef VERBOSE_KEYGEN
  double start,stop;
  /***********************
   *       Step 1        *
   ***********************/
  start=clock();
+#endif

  // First compute and commit to the challenge value
  vector<unsigned int> e(P.num_players());
  vector<octetStream> Comm_e(P.num_players());
  vector<octetStream> Open_e(P.num_players());
  Commit_To_Challenge(e,Comm_e,Open_e,P,num_runs);
+
+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 1 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 2        *
@@ -319,11 +325,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  vector<PRNG> G(num_runs);
  Commit_To_Seeds(G,seeds,Comm_seeds,Open_seeds,P,num_runs);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 2 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 2.5      *
@@ -340,28 +348,27 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
      keys[i].Gen_Random_Data(G[i]);
      a[i][P.my_num()] = keys[i].a;
    }
-  cout << "Generated Random Vals" << endl;

  if (commit)
    {
      // Do Commit and Open to Get a
      Commit_And_Open(a,P,num_runs);
-      cout << "Finished Commit and Open" << endl;
    }
  else
    {
      Transmit_Data(a,P,num_runs);
-      cout << "Finished open" << endl;
    }
  for (int i=0; i<num_runs; i++)
    keys[i].sum_a(a[i]);

  a.clear();
+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 2.5 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 3        *
@@ -373,11 +380,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
      b[i][P.my_num()] = keys[i].b;
    }

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 3 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 4        *
@@ -387,11 +396,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  else
    Transmit_Data(b,P,num_runs);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 4 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *     Step 5/6        *
@@ -404,11 +415,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
    }

  b.clear();
+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 5/6 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 7        *
@@ -418,11 +431,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  else
    Transmit_Data(enc_dash,P,num_runs);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 7 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *    Step 8/9/10      *
@@ -434,11 +449,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
      enc[i][P.my_num()] = keys[i].enc;
   }

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 8/9/10 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 11       *
@@ -448,11 +465,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  else
    Transmit_Data(enc,P,num_runs);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 11 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *      Step 12        *
@@ -460,11 +479,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  for (int i=0; i<num_runs; i++)
    keys[i].sum_enc(enc[i]);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 12 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *     Step 13/14      *
@@ -472,11 +493,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,

  int challenge=Open_Challenge(e,Open_e,Comm_e,P,num_runs);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 13/14 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  /***********************
   *       Step 15       *
@@ -489,7 +512,10 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  /* Now Open All Bar The Challenge Run */
  for (int i = 0; i < num_runs; i++)
    { if (i != challenge)
-        { cout << "Checking run " << i << endl;
+        {
+#ifdef VERBOSE_KEYGEN
+          cout << "Checking run " << i << endl;
+#endif
          check_randomness(seeds[i], keys[i].enc, keys[i].pk, keys[i].enc_dash, P.num_players());
        }
    }
@@ -497,15 +523,19 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
  // Set the key to the chosen run's output
  keys[challenge].finalize(pk, sk);

+#ifdef VERBOSE_KEYGEN
  cout << "Done Step 15 " << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
  start=clock();
+#endif

  P.Check_Broadcast();
+#ifdef VERBOSE_KEYGEN
  cout << "Broadcast check all passed" << endl;

  stop=clock();
  cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
+#endif
 }
--- a/FHEOffline/Prover.cpp
+++ b/FHEOffline/Prover.cpp
@@ -15,13 +15,13 @@ Prover<FD,U>::Prover(Proof& proof, const FD& FieldD) :
  s.resize(proof.V, proof.pk->get_params());
  y.resize(proof.V, FieldD);
 #ifdef LESS_ALLOC_MORE_MEM
-  s.allocate_slots(bigint(1) << proof.B_rand_length);
-  y.allocate_slots(bigint(1) << proof.B_plain_length);
  t = s[0];
  z = y[0];
  // extra limb to prevent reallocation
  t.allocate_slots(bigint(1) << (proof.B_rand_length + 64));
  z.allocate_slots(bigint(1) << (proof.B_plain_length + 64));
+  s.allocate_slots(bigint(1) << proof.B_rand_length);
+  y.allocate_slots(bigint(1) << proof.B_plain_length);
 #endif
 }

--- a/FHEOffline/SimpleMachine.cpp
+++ b/FHEOffline/SimpleMachine.cpp
@@ -299,12 +299,6 @@ void MachineBase::run()
            << timer.elapsed() << " seconds" << endl;
    cout << "CPU time: " << cpu_timer.elapsed() << endl;

-    extern unsigned long long sent_amount, sent_counter;
-    cout << "Data sent = " << sent_amount << " bytes in " << sent_counter
-            << " calls, ";
-    cout << sent_amount / sent_counter / N.num_players() << " bytes per call"
-            << endl;
-
    cout << "Time: " << timer.elapsed() << endl;
    cout << "Throughput: " << total / timer.elapsed() << endl;
    mult_performance();
--- a/GC/MaliciousRepSecret.h
+++ b/GC/MaliciousRepSecret.h
@@ -52,46 +52,59 @@ public:
    }
 };

-class MaliciousRepSecret : public ReplicatedSecret<MaliciousRepSecret>
+// CRTP base collecting the typedefs and helpers shared by malicious
+// replicated binary share types; U is the concrete share class.
+template<class U>
+class MalRepSecretBase : public ReplicatedSecret<U>
 {
-    typedef ReplicatedSecret<MaliciousRepSecret> super;
+    typedef ReplicatedSecret<U> super;

 public:
-    typedef Memory<MaliciousRepSecret> DynamicMemory;
+    typedef Memory<U> DynamicMemory;

-    typedef MaliciousRepMC<MaliciousRepSecret> MC;
+    typedef MaliciousRepMC<U> MC;
    typedef MC MAC_Check;

-    typedef Beaver<MaliciousRepSecret> Protocol;
-    typedef ReplicatedInput<MaliciousRepSecret> Input;
-    typedef RepPrep<MaliciousRepSecret> LivePrep;
+    typedef ReplicatedInput<U> Input;
+    typedef RepPrep<U> LivePrep;

-    typedef MaliciousRepSecret part_type;
-    typedef MaliciousRepSecret whole_type;
-
-    typedef SmallMalRepSecret small_type;
+    typedef U part_type;
+    typedef U whole_type;

    static const bool expensive_triples = true;

-    static MC* new_mc(mac_key_type)
+    // Returns a CommMaliciousRepMC when the ThreadMaster singleton exists and
+    // has more_comm_less_comp set; otherwise (including when there is no
+    // singleton) a HashMaliciousRepMC. The argument is ignored.
+    static MC* new_mc(BitVec)
    {
        try
        {
-            if (ThreadMaster<MaliciousRepSecret>::s().machine.more_comm_less_comp)
-                return new CommMaliciousRepMC<MaliciousRepSecret>;
+            if (ThreadMaster<U>::s().machine.more_comm_less_comp)
+                return new CommMaliciousRepMC<U>;
        }
        catch(no_singleton& e)
        {
        }
-        return new HashMaliciousRepMC<MaliciousRepSecret>;
+        return new HashMaliciousRepMC<U>;
    }

-    static MaliciousRepSecret constant(const BitVec& other, int my_num, const BitVec& alphai)
+    // Builds a share from the constant `other`; my_num and alphai are unused.
+    static U constant(const BitVec& other, int my_num, const BitVec& alphai)
    {
        (void) my_num, (void) alphai;
        return other;
    }

+    MalRepSecretBase() {}
+    template<class T>
+    MalRepSecretBase(const T& other) : super(other) {}
+};
+
+class MaliciousRepSecret : public MalRepSecretBase<MaliciousRepSecret>
+{
+    typedef MaliciousRepSecret This;
+    typedef MalRepSecretBase<This> super;
+
+public:
+    typedef Beaver<MaliciousRepSecret> Protocol;
+
+    typedef SmallMalRepSecret small_type;
+
    MaliciousRepSecret() {}
    template<class T>
    MaliciousRepSecret(const T& other) : super(other) {}
--- a/GC/PostSacriBin.cpp
+++ b/GC/PostSacriBin.cpp
@@ -0,0 +1,120 @@
+/*
+ * PostSacriBin.cpp
+ *
+ */
+
+#include "PostSacriBin.h"
+
+#include "Processor/Processor.h"
+
+#include "Protocols/Replicated.hpp"
+#include "Protocols/MaliciousRepMC.hpp"
+#include "ShareSecret.hpp"
+
+namespace GC
+{
+
+// Forward the player to ReplicatedBase and to the `honest` member.
+PostSacriBin::PostSacriBin(Player& P) : ReplicatedBase(P), honest(P) {}
+
+// Terminate the program if ANDs were recorded but never checked.
+PostSacriBin::~PostSacriBin()
+{
+    if (inputs.empty())
+        return;
+
+    cerr << "unchecked ANDs" << endl;
+    terminate();
+}
+
+// Overload taking the preprocessing and MAC check objects from `proc`,
+// which must not be null.
+void PostSacriBin::init_mul(SubProcessor<T>* proc)
+{
+    assert(proc != 0);
+    auto& processor = *proc;
+    init_mul(processor.DataF, processor.MC);
+}
+
+// Start a multiplication round. Pending ANDs are checked first once at
+// least batch_size of them have accumulated; both arguments are unused.
+void PostSacriBin::init_mul(Preprocessing<T>&, T::MC&)
+{
+    const auto batch_size = OnlineOptions::singleton.batch_size;
+    if ((int) inputs.size() >= batch_size)
+        check();
+    honest.init_mul();
+}
+
+// Queue x AND y in the inner protocol and record both operands, masked to
+// n bits, for the later check. Always returns an empty clear value.
+PostSacriBin::T::clear PostSacriBin::prepare_mul(const T& x, const T& y, int n)
+{
+    honest.prepare_mul(x, y, n);
+    const array<T, 2> masked_operands = {{x.mask(n), y.mask(n)}};
+    inputs.push_back(masked_operands);
+    return {};
+}
+
+// Delegates to the inner protocol's exchange().
+void PostSacriBin::exchange() { honest.exchange(); }
+
+// Fetch the next product from the inner protocol and record it, together
+// with its bit length n, for the later check.
+PostSacriBin::T PostSacriBin::finalize_mul(int n)
+{
+    auto product = honest.finalize_mul(n);
+    outputs.emplace_back(product, n);
+    return product;
+}
+
+// Verify all ANDs recorded since the last check. to_check receives the
+// recorded (x, y, result) triples followed by one get_d1_triple() triple per
+// AND, and is handed to TripleShuffleSacrifice::triple_sacrifice(); MC.Check()
+// then runs and both queues are cleared.
+void PostSacriBin::check()
+{
+    vector<array<T, 3>> to_check;
+    assert(inputs.size() == outputs.size());
+    for (size_t i = 0; i < inputs.size(); i++)
+        to_check.push_back({{inputs[i][0], inputs[i][1], outputs[i].first}});
+    // randomness for picking the d1 slots
+    GlobalPRNG G(P);
+    for (size_t i = 0; i < inputs.size(); i++)
+        to_check.push_back(get_d1_triple(G, outputs[i].second));
+    HashMaliciousRepMC<T> MC;
+    // output argument of triple_sacrifice(); not read afterwards
+    vector<array<T, 3>> _(N);
+    TripleShuffleSacrifice<T>(2, 6).triple_sacrifice(_, to_check, P, MC, 0, inputs.size());
+    MC.Check(P);
+    inputs.clear();
+    outputs.clear();
+}
+
+// Return an n_bits-wide triple taken from a slot of d1 chosen with G.
+// d1 is first filled up to N entries of T::N_BITS bits from d2; the chosen
+// slot is then shifted left by n_bits and XORed with a fresh n_bits triple
+// from d2.
+array<PostSacriBin::T, 3> PostSacriBin::get_d1_triple(GlobalPRNG& G, int n_bits)
+{
+    while (d1.size() < N)
+        d1.push_back(get_d2_triple(T::N_BITS));
+    int i = G.get_uint(N);
+    // take the n_bits-masked part of the slot before it is modified
+    auto tmp = d1.at(i).mask(n_bits);
+    d1[i] <<= n_bits;
+    d1[i] ^= get_d2_triple(n_bits);
+    array<T, 3> res({{tmp[0], tmp[1], tmp[2]}});
+    return res;
+}
+
+// Take an n_bits-wide triple from the triple buffer via
+// get_triple_no_count().
+array<PostSacriBin::T, 3> PostSacriBin::get_d2_triple(int n_bits)
+{
+    array<T, 3> triple = get_triple_no_count(n_bits);
+    return triple;
+}
+
+// Write one triple from d2 into res[0..2]; only DATA_TRIPLE is supported.
+// When d2 is empty, 2 * N + 6 triples are generated with a plain
+// Replicated<T> protocol and passed through TripleShuffleSacrifice, after
+// which d2 has to hold exactly N triples.
+void PostSacriBin::get(Dtype type, T* res)
+{
+    assert(type == DATA_TRIPLE);
+
+    if (d2.empty())
+    {
+        TripleShuffleSacrifice<T> sacrifice(2, 6);
+        vector<array<T, 3>> candidates;
+
+        // optimistic triple generation
+        Replicated<T> protocol(P);
+        generate_triples(candidates, 2 * N + 6, &protocol, T::N_BITS);
+        HashMaliciousRepMC<T> MC;
+        sacrifice.triple_sacrifice(d2, candidates, P, MC, 0);
+        MC.Check(P);
+        assert(d2.size() == N);
+    }
+
+    const array<T, 3>& triple = d2.back();
+    for (int i = 0; i < 3; i++)
+        res[i] = triple[i];
+    d2.pop_back();
+}
+
+} /* namespace GC */
--- a/GC/PostSacriBin.h
+++ b/GC/PostSacriBin.h
@@ -0,0 +1,52 @@
+/*
+ * PostSacriBin.h
+ *
+ */
+
+#ifndef GC_POSTSACRIBIN_H_
+#define GC_POSTSACRIBIN_H_
+
+#include "PostSacriSecret.h"
+#include "Protocols/Replicated.h"
+#include "ShiftableTripleBuffer.h"
+
+namespace GC
+{
+
+// Binary multiplication protocol that runs ANDs through Replicated<T> and
+// verifies the recorded operands and results afterwards in check().
+class PostSacriBin : public ReplicatedBase,
+        public ProtocolBase<PostSacriSecret>,
+        ShiftableTripleBuffer<PostSacriSecret>
+{
+    typedef PostSacriSecret T;
+
+    // protocol performing the actual multiplications
+    Replicated<T> honest;
+
+    // masked operands and (result, bit length) of ANDs not yet checked
+    vector<array<T, 2>> inputs;
+    vector<pair<T, int>> outputs;
+
+    // as in Araki et al. (S&P'17)
+    vector<FixedVec<T, 3>> d1;
+    vector<array<T, 3>> d2;
+
+    array<T, 3> get_d1_triple(GlobalPRNG& G, int n_bits);
+    array<T, 3> get_d2_triple(int n_bits);
+    // supplies a triple from d2, refilling d2 when it is empty
+    void get(Dtype type, T* res);
+
+    // number of triples kept in d1 and produced per d2 refill
+    const size_t N = 1 << 20;
+
+public:
+    PostSacriBin(Player& P);
+    // terminates the program if unchecked ANDs remain
+    ~PostSacriBin();
+
+    void init_mul(Preprocessing<T>&, T::MC&);
+    void init_mul(SubProcessor<T>* proc);
+    T::clear prepare_mul(const T& x, const T& y, int n = -1);
+    void exchange();
+    T finalize_mul(int n = -1);
+
+    // verify all ANDs recorded so far and clear the record
+    void check();
+};
+
+} /* namespace GC */
+
+#endif /* GC_POSTSACRIBIN_H_ */
--- a/GC/PostSacriSecret.h
+++ b/GC/PostSacriSecret.h
@@ -0,0 +1,37 @@
+/*
+ * PostSacriSecret.h
+ *
+ */
+
+#ifndef GC_POSTSACRISECRET_H_
+#define GC_POSTSACRISECRET_H_
+
+#include "MaliciousRepSecret.h"
+
+namespace GC
+{
+
+class PostSacriBin;
+
+// Share type for post-sacrifice binary computation: MalRepSecretBase with
+// PostSacriBin selected as the multiplication protocol.
+class PostSacriSecret : public MalRepSecretBase<PostSacriSecret>
+{
+    typedef PostSacriSecret This;
+    typedef MalRepSecretBase<This> super;
+
+public:
+    typedef PostSacriBin Protocol;
+
+    PostSacriSecret() {}
+
+    // converting constructor, forwarded to the base class
+    template<class T>
+    PostSacriSecret(const T& other) : super(other) {}
+};
+
+}
+
+#endif
--- a/GC/Program.hpp
+++ b/GC/Program.hpp
@@ -14,6 +14,8 @@

 #include "Tools/callgrind.h"

+#include "Processor/Instruction.hpp"
+
 namespace GC
 {

--- a/GC/ShareParty.h
+++ b/GC/ShareParty.h
@@ -22,7 +22,7 @@ class ShareParty : public ThreadMaster<T>
 {
    static ShareParty<T>* singleton;

-    ez::ezOptionParser opt;
+    ez::ezOptionParser& opt;
    OnlineOptions online_opts;

 public:
@@ -30,7 +30,8 @@ public:

    typename T::mac_key_type mac_key;

-    ShareParty(int argc, const char** argv, int default_batch_size = 0);
+    ShareParty(int argc, const char** argv, ez::ezOptionParser& opt,
+            int default_batch_size = 0);

    Thread<T>* new_thread(int i);

--- a/GC/ShareParty.hpp
+++ b/GC/ShareParty.hpp
@@ -17,6 +17,10 @@

 #include "ShareThread.hpp"
 #include "RepPrep.hpp"
+#include "ThreadMaster.hpp"
+#include "Thread.hpp"
+#include "ShareSecret.hpp"
+
 #include "Protocols/Replicated.hpp"
 #include "Protocols/ReplicatedPrep.hpp"
 #include "Protocols/MaliciousRepMC.hpp"
@@ -29,16 +33,31 @@ template<class T>
 ShareParty<T>* ShareParty<T>::singleton = 0;

 template<class T>
-ShareParty<T>::ShareParty(int argc, const char** argv, int default_batch_size) :
-        ThreadMaster<T>(online_opts), online_opts(opt, argc, argv,
+// Shared entry point for binary-protocol executables: creates a local option
+// parser and constructs ShareParty<T> with it.
+void simple_binary_main(int argc, const char** argv, int default_batch_size = 0)
+{
+    ez::ezOptionParser opt;
+    ShareParty<T> party(argc, argv, opt, default_batch_size);
+}
+
+template<class T>
+ShareParty<T>::ShareParty(int argc, const char** argv, ez::ezOptionParser& opt,
+        int default_batch_size) :
+        ThreadMaster<T>(online_opts), opt(opt),
+        online_opts(this->opt, argc, argv,
                default_batch_size)
 {
    if (singleton)
        throw runtime_error("there can only be one");
    singleton = this;

+    int nplayers = 0;
+    opt.parse(argc, argv);
+    if (opt.get("-N"))
+        opt.get("-N")->getInt(nplayers);
+    opt.resetArgs();
    NetworkOptionsWithNumber network_opts(opt, argc, argv,
-            T::dishonest_majority ? 2 : 3, T::variable_players);
+            nplayers > 0 ? nplayers : (T::dishonest_majority ? 2 : 3),
+            T::variable_players and nplayers == 0);
    if (T::dishonest_majority)
        opt.add(
                "", // Default.
--- a/GC/square64.cpp
+++ b/GC/square64.cpp
@@ -25,7 +25,7 @@ union matrix32x8

    void transpose(square64& output, int x, int y)
    {
-#ifdef __AVX2__
+#if defined(__AVX2__) || !defined(__x86_64__)
        if (cpu_has_avx2())
        {
            for (int j = 0; j < 8; j++)
@@ -66,7 +66,7 @@ case I: \
 void zip(int chunk_size, __m256i& lows, __m256i& highs,
        const __m256i& a, const __m256i& b)
 {
-#ifdef __AVX2__
+#if defined(__AVX2__) || !defined(__x86_64__)
    if (cpu_has_avx2())
    {
        switch (chunk_size)
--- a/GC/square64.h
+++ b/GC/square64.h
@@ -6,10 +6,10 @@
 #ifndef GC_SQUARE64_H_
 #define GC_SQUARE64_H_

-#include <immintrin.h>
 #include <string.h>
 #include <cstdint>
 #include "Tools/int.h"
+#include "Tools/intrinsics.h"

 union square64
 {
--- a/Machines/ccd-party.cpp
+++ b/Machines/ccd-party.cpp
@@ -17,5 +17,7 @@
 int main(int argc, const char** argv)
 {
    gf2n_short::init_field(40);
-    GC::ShareParty<GC::CcdSecret<gf2n_short>>(argc, argv);
+    ez::ezOptionParser opt;
+    ShamirOptions::singleton = {opt, argc, argv};
+    GC::ShareParty<GC::CcdSecret<gf2n_short>>(argc, argv, opt);
 }
--- a/Machines/malicious-ccd-party.cpp
+++ b/Machines/malicious-ccd-party.cpp
@@ -18,5 +18,7 @@
 int main(int argc, const char** argv)
 {
    gf2n_short::init_field(40);
-    GC::ShareParty<GC::MaliciousCcdSecret<gf2n_short>>(argc, argv);
+    ez::ezOptionParser opt;
+    ShamirOptions::singleton = {opt, argc, argv};
+    GC::ShareParty<GC::MaliciousCcdSecret<gf2n_short>>(argc, argv, opt);
 }
--- a/Machines/malicious-rep-bin-party.cpp
+++ b/Machines/malicious-rep-bin-party.cpp
@@ -21,5 +21,5 @@

 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::MaliciousRepSecret>(argc, argv);
+    GC::simple_binary_main<GC::MaliciousRepSecret>(argc, argv);
 }
--- a/Machines/ps-rep-bin-party.cpp
+++ b/Machines/ps-rep-bin-party.cpp
@@ -0,0 +1,14 @@
+/*
+ * ps-rep-bin-party.cpp
+ *
+ */
+
+#include "GC/PostSacriBin.h"
+#include "GC/PostSacriSecret.h"
+
+#include "GC/ShareParty.hpp"
+
+// Runs a binary party on GC::PostSacriSecret shares through
+// GC::simple_binary_main, forwarding the command line unchanged.
+int main(int argc, const char** argv)
+{
+    GC::simple_binary_main<GC::PostSacriSecret>(argc, argv);
+}
--- a/Machines/replicated-bin-party.cpp
+++ b/Machines/replicated-bin-party.cpp
@@ -20,5 +20,5 @@

 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::SemiHonestRepSecret>(argc, argv);
+    GC::simple_binary_main<GC::SemiHonestRepSecret>(argc, argv);
 }
--- a/Machines/semi-bin-party.cpp
+++ b/Machines/semi-bin-party.cpp
@@ -23,5 +23,5 @@

 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::SemiSecret>(argc, argv);
+    GC::simple_binary_main<GC::SemiSecret>(argc, argv);
 }
--- a/Machines/tinier-party.cpp
+++ b/Machines/tinier-party.cpp
@@ -27,5 +27,5 @@
 int main(int argc, const char** argv)
 {
    gf2n_short::init_field(40);
-    GC::ShareParty<GC::TinierSecret<gf2n_short>>(argc, argv, 1000);
+    GC::simple_binary_main<GC::TinierSecret<gf2n_short>>(argc, argv, 1000);
 }
--- a/Machines/tiny-party.cpp
+++ b/Machines/tiny-party.cpp
@@ -26,5 +26,5 @@

 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::TinySecret<40>>(argc, argv, 1000);
+    GC::simple_binary_main<GC::TinySecret<40>>(argc, argv, 1000);
 }
--- a/38
+++ b/38
@@ -26,7 +26,12 @@ VM = $(PROCESSOR) $(COMMON) GC/square64.o GC/Instruction.o OT/OTTripleSetup.o OT

 LIB = libSPDZ.a
 LIBRELEASE = librelease.a
+
+ifeq ($(AVX_OT), 0)
+LIBSIMPLEOT = ECDSA/P256Element.o
+else
 LIBSIMPLEOT = SimpleOT/libsimpleot.a
+endif

 # used for dependency generation
 OBJS = $(BMR) $(FHEOFFLINE) $(TINYOTOFFLINE) $(YAO) $(COMPLETE) $(patsubst %.cpp,%.o,$(wildcard Machines/*.cpp Utils/*.cpp))
@@ -47,7 +52,6 @@ binary: rep-bin yao semi-bin-party.x tinier-party.x tiny-party.x ccd-party.x mal

 ifeq ($(USE_NTL),1)
 all: overdrive she-offline
-gear: cowgear-party.x chaigear-party.x lowgear-party.x highgear-party.x
 arithmetic: hemi-party.x soho-party.x gear
 endif

@@ -73,13 +77,14 @@ yao: yao-party.x

 she-offline: Check-Offline.x spdz2-offline.x

-overdrive: simple-offline.x pairwise-offline.x cnc-offline.x
+overdrive: simple-offline.x pairwise-offline.x cnc-offline.x gear
+gear: cowgear-party.x chaigear-party.x lowgear-party.x highgear-party.x

 rep-field: malicious-rep-field-party.x replicated-field-party.x ps-rep-field-party.x

 rep-ring: replicated-ring-party.x brain-party.x malicious-rep-ring-party.x ps-rep-ring-party.x rep4-ring-party.x

-rep-bin: replicated-bin-party.x malicious-rep-bin-party.x Fake-Offline.x
+rep-bin: replicated-bin-party.x malicious-rep-bin-party.x ps-rep-bin-party.x Fake-Offline.x

 replicated: rep-field rep-ring rep-bin

@@ -96,6 +101,10 @@ else
 tldr: mpir
 endif

+ifeq ($(MACHINE), aarch64)
+tldr: simde/simde
+endif
+
 shamir: shamir-party.x malicious-shamir-party.x galois-degree.x

 sy: sy-rep-field-party.x sy-rep-ring-party.x sy-shamir-party.x
@@ -107,10 +116,10 @@ $(LIBRELEASE): Protocols/MalRepRingOptions.o $(PROCESSOR) $(COMMON) $(OT) $(GC)
 	$(AR) -csr $@ $^

 static/%.x: Machines/%.o $(LIBRELEASE) $(LIBSIMPLEOT)
-	$(CXX) $(CFLAGS) -o $@ $^ $(LIBRELEASE) $(LIBSIMPLEOT) -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++ $(BOOST) $(LDLIBS) -Wl,-Bdynamic -ldl
+	$(CXX) $(CFLAGS) -o $@ $^ -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++  $(LIBRELEASE) $(LIBSIMPLEOT) $(BOOST) $(LDLIBS) -Wl,-Bdynamic -ldl

 static/%.x: ECDSA/%.o ECDSA/P256Element.o $(VM) $(OT) $(LIBSIMPLEOT)
-	$(CXX) $(CFLAGS) -o $@ $^ -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++ $(BOOST) $(LDLIBS) $(ECLIB) -Wl,-Bdynamic -ldl
+	$(CXX) $(CFLAGS) -o $@ $^ -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++ $(BOOST) $(LDLIBS) -Wl,-Bdynamic -ldl

 static-dir:
 	@ mkdir static 2> /dev/null; true
@@ -118,7 +127,7 @@ static-dir:
 static-release: static-dir $(patsubst Machines/%.cpp, static/%.x, $(wildcard Machines/*-party.cpp)) static/emulate.x

 Fake-ECDSA.x: ECDSA/Fake-ECDSA.cpp ECDSA/P256Element.o $(COMMON) Processor/PrepBase.o
-	$(CXX) -o $@ $^ $(CFLAGS) $(LDLIBS) $(ECLIB)
+	$(CXX) -o $@ $^ $(CFLAGS) $(LDLIBS)

 Check-Offline.x: $(PROCESSOR)

@@ -167,14 +176,24 @@ secure.x: Utils/secure.o
 %.x: Machines/%.o $(VM) OT/OTTripleSetup.o OT/BaseOT.o $(LIBSIMPLEOT)
 	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS)

+%gear-party.x: Machines/%gear-party.o $(VM) OT/OTTripleSetup.o OT/BaseOT.o $(LIBSIMPLEOT)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) -lntl
+
+hemi-party.x: Machines/hemi-party.o $(VM)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) -lntl
+
+soho-party.x: Machines/soho-party.o $(VM)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) -lntl
+
 %-ecdsa-party.x: ECDSA/%-ecdsa-party.o ECDSA/P256Element.o $(VM)
-	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) $(ECLIB)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS)

 replicated-bin-party.x: GC/square64.o
 replicated-ring-party.x: GC/square64.o
 replicated-field-party.x: GC/square64.o
 brain-party.x: GC/square64.o
 malicious-rep-bin-party.x: GC/square64.o
+ps-rep-bin-party.x: GC/PostSacriBin.o
 semi-bin-party.x: $(VM) $(OT) GC/SemiSecret.o GC/SemiPrep.o GC/square64.o
 tiny-party.x: $(OT)
 tinier-party.x: $(OT)
@@ -220,6 +239,7 @@ static/semi-bmr-party.x: $(BMR)
 static/real-bmr-party.x: $(BMR)
 static/bmr-program-party.x: $(BMR)

+ifeq ($(AVX_OT), 1)
 $(LIBSIMPLEOT): SimpleOT/Makefile
 	$(MAKE) -C SimpleOT

@@ -227,6 +247,7 @@ OT/BaseOT.o: SimpleOT/Makefile

 SimpleOT/Makefile:
 	git submodule update --init SimpleOT
+endif

 .PHONY: Programs/Circuits
 Programs/Circuits:
@@ -259,5 +280,8 @@ mac-setup:
 	-echo MY_LDLIBS += -L/usr/local/opt/openssl/lib >> CONFIG.mine
 	-echo USE_NTL = 1 >> CONFIG.mine

+simde/simde:
+	git submodule update --init simde
+
 clean:
 	-rm -f */*.o *.o */*.d *.d *.x core.* *.a gmon.out */*/*.o static/*.x
--- a/Math/FixedVec.h
+++ b/Math/FixedVec.h
@@ -277,6 +277,12 @@ public:
        return res;
    }

+    FixedVec<T, L>& operator<<=(int i) // in-place counterpart of operator<<
+    {
+        *this = *this << i; // compute the shifted value with operator<<, then assign it back
+        return *this;
+    }
+
    FixedVec<T, L>& operator>>=(int i)
    {
        *this = *this >> i;
--- a/Math/Integer.h
+++ b/Math/Integer.h
@@ -143,19 +143,6 @@ class Integer : public IntBase<long>
  friend unsigned int& operator+=(unsigned int& x, const Integer& other) { return x += other.a; }

  long operator-() const { return -a; }
-
-  void add(const Integer& x, const Integer& y) { *this = x + y; }
-  void sub(const Integer& x, const Integer& y) { *this = x - y; }
-  void mul(const Integer& x, const Integer& y) { *this = x * y; }
-
-  void mul(const Integer& x) { *this = *this * x; }
-
-  void AND(const Integer& x, const Integer& y) { *this = x & y; }
-  void OR(const Integer& x, const Integer& y) { *this = x | y; }
-  void XOR(const Integer& x, const Integer& y) { *this = x ^ y; }
-  void SHL(const Integer& x, const Integer& y) { *this = x << y; }
-  // unsigned shift for Mod2m
-  void SHR(const Integer& x, const Integer& y) { *this = (unsigned long)x.a >> y.a; }
 };

 inline string to_string(const Integer& x)
--- a/Math/Zp_Data.cpp
+++ b/Math/Zp_Data.cpp
@@ -64,7 +64,9 @@ void Zp_Data::init(const bigint& p,bool mont)

 void Zp_Data::Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y,int t) const
 {
-  mp_limb_t ans[2*MAX_MOD_SZ+1],u;
+  mp_limb_t ans[2 * MAX_MOD_SZ + 1], u, yy[t + 1];
+  inline_mpn_copyi(yy, y, t);
+  yy[t] = 0;
  // First loop
  u=x[0]*y[0]*pi;
  ans[t]  = mpn_mul_1(ans,y,t,x[0]);
@@ -73,8 +75,8 @@ void Zp_Data::Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y,int t
    { // u=(ans0+xi*y0)*pd
      u=(ans[i]+x[i]*y[0])*pi;
      // ans=ans+xi*y+u*pr
-      ans[t+i]+=mpn_addmul_1(ans+i,y,t,x[i]);
-      ans[t+i+1]=mpn_addmul_1(ans+i,prA,t+1,u);
+      ans[t+i+1]=mpn_addmul_1(ans+i,yy,t+1,x[i]);
+      ans[t+i+1]+=mpn_addmul_1(ans+i,prA,t+1,u);
    }
  // if (ans>=pr) { ans=z-pr; }
  // else         { z=ans;    }
--- a/Math/Zp_Data.h
+++ b/Math/Zp_Data.h
@@ -13,8 +13,8 @@
 #include "Math/bigint.h"
 #include "Math/mpn_fixed.h"
 #include "Tools/random.h"
+#include "Tools/intrinsics.h"

-#include <smmintrin.h>
 #include <iostream>
 using namespace std;

@@ -43,6 +43,8 @@ class Zp_Data
  void Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y, int t) const;
  void Mont_Mult_variable(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y) const
  { Mont_Mult(z, x, y, t); }
+  void Mont_Mult_max(mp_limb_t* z, const mp_limb_t* x, const mp_limb_t* y,
+      int max_t) const;

  public:

@@ -125,7 +127,7 @@ inline void Zp_Data::Add<0>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y
 template<>
 inline void Zp_Data::Add<1>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y) const
 {
-#ifdef __clang__
+#if defined(__clang__) || !defined(__x86_64__)
  Add<0>(ans, x, y);
 #else
  *ans = *x + *y;
@@ -139,7 +141,7 @@ inline void Zp_Data::Add<1>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y
 template<>
 inline void Zp_Data::Add<2>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y) const
 {
-#ifdef __clang__
+#if defined(__clang__) || !defined(__x86_64__)
  Add<0>(ans, x, y);
 #else
  __uint128_t a, b, p;
@@ -229,7 +231,7 @@ inline void Zp_Data::Mont_Mult_(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t*
    { // u=(ans0+xi*y0)*pd
      u=(ans[i]+x[i]*y[0])*pi;
      // ans=ans+xi*y+u*pr
-      mpn_addmul_1_fixed_<T + 1, T>(ans+i,y,x[i]);
+      mpn_addmul_1_fixed_<T + 2, T>(ans+i,y,x[i]);
      mpn_addmul_1_fixed_<T + 2, T + 1>(ans+i,prA,u);
    }
  // if (ans>=pr) { ans=z-pr; }
@@ -276,4 +278,11 @@ inline void Zp_Data::Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t*
  }
 }

+inline void Zp_Data::Mont_Mult_max(mp_limb_t* z, const mp_limb_t* x,
+    const mp_limb_t* y, int max_t) const // Mont_Mult with a caller-supplied upper bound on t
+{
+  assert(t <= max_t); // member t must not exceed the caller's bound; checked only when assertions are enabled
+  Mont_Mult(z, x, y); // max_t is not passed on, the three-argument Mont_Mult is called as-is
+}
+
 #endif
--- a/Math/bigint.cpp
+++ b/Math/bigint.cpp
@@ -153,6 +153,11 @@ bigint::bigint(const gfpvar& other)
  to_bigint(*this, other.get(), other.get_ZpD());
 }

+bigint::bigint(const mp_limb_t* data, size_t n_limbs) // import n_limbs limbs from data, least significant limb first
+{
+  mpz_import(get_mpz_t(), n_limbs, -1, sizeof(mp_limb_t), 0, 0, data); // word size and byte order follow the native limb type instead of assuming 8-byte little-endian limbs
+}
+
 string to_string(const bigint& x)
 {
  stringstream ss;
--- a/Math/bigint.h
+++ b/Math/bigint.h
@@ -63,6 +63,7 @@ public:
  bigint(const fixint<L>& x) : bigint(typename fixint<L>::super(x)) {}
  bigint(const Integer& x);
  bigint(const GC::Clear& x);
+  bigint(const mp_limb_t* data, size_t n_limbs);

  bigint& operator=(int n);
  bigint& operator=(long n);
@@ -75,6 +76,11 @@ public:
  template<int K>
  bigint& operator=(const SignedZ2<K>& x);

+  template<int X, int L>
+  bigint& from_signed(const gfp_<X, L>& other);
+  template<class T>
+  bigint& from_signed(const T& other);
+
  void allocate_slots(const bigint& x) { *this = x; }
  int get_min_alloc() { return get_mpz_t()->_mp_alloc; }

--- a/Math/bigint.hpp
+++ b/Math/bigint.hpp
@@ -9,12 +9,27 @@
 #include "bigint.h"
 #include "Integer.h"

+template<int X, int L>
+bigint& bigint::from_signed(const gfp_<X, L>& other) // overload for gfp_ elements
+{
+    to_signed_bigint(*this, other); // conversion is delegated to to_signed_bigint, which writes into *this
+    return *this; // returns *this so calls can be chained
+}
+
+template<class T>
+bigint& bigint::from_signed(const T& other) // generic overload for every type other than gfp_
+{
+    *this = other; // plain assignment, no separate signed conversion step
+    return *this; // returns *this so calls can be chained
+}
+
 template<class T>
 mpf_class bigint::get_float(T v, T p, T z, T s)
 {
    // MPIR can't handle more precision in exponent
    Integer exp = Integer(p, 31).get();
-    bigint tmp = v;
+    bigint tmp;
+    tmp.from_signed(v);
    mpf_class res = tmp;
    if (exp > 0)
        mpf_mul_2exp(res.get_mpf_t(), res.get_mpf_t(), exp.get());
--- a/Math/fixint.h
+++ b/Math/fixint.h
@@ -11,7 +11,7 @@
 template<int L>
 class fixint : public SignedZ2<64 * (L + 1)>
 {
-    static const int OVERFLOW = 60;
+    static const int N_OVERFLOW = 60;

 public:
    typedef SignedZ2<64 * (L + 1)> super;
@@ -24,7 +24,7 @@ public:
    fixint(const T& other) :
            super(other)
    {
-        auto check = mp_limb_signed_t(this->a[this->N_WORDS - 1]) >> OVERFLOW;
+        auto check = mp_limb_signed_t(this->a[this->N_WORDS - 1]) >> N_OVERFLOW;
        assert(check == 0 or check == -1);
    }

@@ -70,10 +70,10 @@ public:
    void allocate_slots(const T& limit)
    {
        int n_bits = this->size_in_bits();
-        if (numBits(limit) - OVERFLOW > n_bits)
+        if (numBits(limit) - N_OVERFLOW > n_bits)
        {
-        cerr << "cannot hold " << numBits(limit) << " bits, " << n_bits
-                << " available" << endl;
+            cerr << "maybe change N_LIMBS_RAND to at least "
+                    << ((numBits(limit) - N_OVERFLOW) / 64) << endl;
            throw runtime_error("fixed-length integer too small");
        }
    }
--- a/Math/gf2n.cpp
+++ b/Math/gf2n.cpp
@@ -2,12 +2,10 @@
 #include "Math/gf2n.h"
 #include "Math/Bit.h"

+#include "Tools/intrinsics.h"
 #include "Tools/Exceptions.h"

 #include <stdint.h>
-#include <wmmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>

 const false_type ValueInterface::characteristic_two;
 const false_type ValueInterface::prime_field;
@@ -16,6 +14,9 @@ const false_type ValueInterface::invertible;
 const true_type gf2n_short::characteristic_two;
 const true_type gf2n_long::characteristic_two;

+const true_type gf2n_short::invertible;
+const true_type gf2n_long::invertible;
+
 int gf2n_short::n = 0;
 int gf2n_short::t1;
 int gf2n_short::t2;
--- a/Math/gf2nlong.cpp
+++ b/Math/gf2nlong.cpp
@@ -6,12 +6,10 @@
 #include "gf2nlong.h"
 #include "gf2n.h"

+#include "Tools/intrinsics.h"
 #include "Tools/Exceptions.h"

 #include <stdint.h>
-#include <wmmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>


 bool is_ge(__m128i a, __m128i b)
--- a/Math/gf2nlong.h
+++ b/Math/gf2nlong.h
@@ -12,9 +12,8 @@
 #include <iostream>
 using namespace std;

-#include <smmintrin.h>
-
 #include "Tools/random.h"
+#include "Tools/intrinsics.h"
 #include "Math/field_types.h"
 #include "Math/bigint.h"

@@ -283,7 +282,7 @@ inline __m128i software_clmul(__m128i a, __m128i b, int choice)
 template<int choice>
 inline __m128i clmul(__m128i a, __m128i b)
 {
-#ifdef __PCLMUL__
+#if defined(__PCLMUL__) || !defined(__x86_64__)
    if (cpu_has_pclmul())
    {
        return _mm_clmulepi64_si128(a, b, choice);
--- a/Math/gfpvar.cpp
+++ b/Math/gfpvar.cpp
@@ -9,6 +9,9 @@

 #include "gfp.hpp"

+const true_type gfpvar::invertible;
+const true_type gfpvar::prime_field;
+
 Zp_Data gfpvar::ZpD;

 string gfpvar::type_string()
--- a/Math/modp.h
+++ b/Math/modp.h
@@ -151,7 +151,7 @@ template<int L>
 inline void Mul(modp_<L>& ans,const modp_<L>& x,const modp_<L>& y,const Zp_Data& ZpD)
 {
  if (ZpD.montgomery)
-    { ZpD.Mont_Mult(ans.x,x.x,y.x); }
+    { ZpD.Mont_Mult_max(ans.x,x.x,y.x,L); }
  else
    { //ans.x=(x.x*y.x)%ZpD.pr;
      mp_limb_t aa[2*L],q[2*L];
--- a/Math/modp.hpp
+++ b/Math/modp.hpp
@@ -253,7 +253,7 @@ void Inv(modp_<L>& ans,const modp_<L>& x,const Zp_Data& ZpD)
  else
    { for (int i=sz; i<ZpD.t; i++) { ans.x[i]=0; } }
  if (ZpD.montgomery)
-    { ZpD.Mont_Mult(ans.x,ans.x,ZpD.R3); }
+    { ZpD.Mont_Mult_max(ans.x,ans.x,ZpD.R3,L); }
 }


--- a/Math/mpn_fixed.h
+++ b/Math/mpn_fixed.h
@@ -9,10 +9,10 @@
 #include <mpir.h>
 #include <string.h>
 #include <assert.h>
-#include <x86intrin.h>

 #include "Tools/avx_memcpy.h"
 #include "Tools/cpu_support.h"
+#include "Tools/intrinsics.h"

 inline void inline_mpn_zero(mp_limb_t* x, mp_size_t size)
 {
@@ -50,6 +50,7 @@ inline void mpn_add_fixed_n<1>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
    *res = *x + *y;
 }

+#ifdef __x86_64__
 template <>
 inline void mpn_add_fixed_n<2>(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y)
 {
@@ -91,6 +92,7 @@ inline void mpn_add_fixed_n<4>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
            : "cc"
    );
 }
+#endif

 #ifdef __clang__
 inline char clang_add_carry(char carryin, unsigned long x, unsigned long y, unsigned long& res)
@@ -133,16 +135,15 @@ mp_limb_t mpn_add_fixed_n_with_carry(mp_limb_t* res, const mp_limb_t* x, const m

 inline mp_limb_t mpn_sub_n_borrow(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y, int n)
 {
-#ifndef __clang__
-#if __GNUC__ < 7
+#if (!defined(__clang__) && (__GNUC__ < 7)) || !defined(__x86_64__)
    // GCC 6 can't handle the code below
    return mpn_sub_n(res, x, y, n);
-#endif
-#endif
+#else // x86-64 with clang or GCC >= 7: subtract limb by limb, propagating the borrow through _subborrow_u64
    char borrow = 0;
    for (int i = 0; i < n; i++)
        borrow = _subborrow_u64(borrow, x[i], y[i], (unsigned long long*)&res[i]);
    return borrow;
+#endif // old GCC (not clang) or non-x86-64 targets use mpn_sub_n above
 }

 template <int N>
@@ -163,6 +164,7 @@ inline void mpn_sub_fixed_n<1>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
    *res = *x - *y;
 }

+#ifdef __x86_64__
 template <>
 inline mp_limb_t mpn_sub_fixed_n_borrow<1>(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y)
 {
@@ -235,6 +237,7 @@ inline void mpn_sub_fixed_n<4>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
            : "cc"
    );
 }
+#endif

 inline void mpn_add_n_use_fixed(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y, mp_size_t n)
 {
@@ -260,8 +263,8 @@ template <int L, int M, bool ADD>
 inline void mpn_addmul_1_fixed__(mp_limb_t* res, const mp_limb_t* y, mp_limb_t x)
 {
    mp_limb_t lower[L], higher[L];
-    lower[L - 1] = 0;
-    higher[L - 1] = 0;
+    inline_mpn_zero(higher + M, L - M);
+    inline_mpn_zero(lower + M, L - M);
    for (int j = 0; j < M; j++)
        lower[j] = _mulx_u64(x, y[j], (long long unsigned*)higher + j);
    if (ADD)
--- a/Networking/Server.cpp
+++ b/Networking/Server.cpp
@@ -158,7 +158,7 @@ void* Server::start_in_thread(void* server)
 }

 Server* Server::start_networking(Names& N, int my_num, int nplayers,
-        string hostname, int portnum)
+        string hostname, int portnum, int my_port)
 {
 #ifdef DEBUG_NETWORKING
  cerr << "Starting networking for " << my_num << "/" << nplayers
@@ -173,7 +173,7 @@ Server* Server::start_networking(Names& N, int my_num, int nplayers,
      pthread_create(&thread, 0, Server::start_in_thread,
          server = new Server(nplayers, portnum));
    }
-  N.init(my_num, portnum, Names::DEFAULT_PORT, hostname.c_str());
+  N.init(my_num, portnum, my_port, hostname.c_str());
  if (my_num == 0)
    {
      pthread_join(thread, 0);
--- a/Networking/Server.h
+++ b/Networking/Server.h
@@ -26,7 +26,8 @@ class Server
 public:
    static void* start_in_thread(void* server);
    static Server* start_networking(Names& N, int my_num, int nplayers,
-            string hostname = "localhost", int portnum = 9000);
+            string hostname = "localhost", int portnum = 9000, int my_port =
+                    Names::DEFAULT_PORT);

    Server(int argc, char** argv);
    Server(int nmachines, int PortnumBase);
--- a/Networking/ServerSocket.cpp
+++ b/Networking/ServerSocket.cpp
@@ -6,6 +6,7 @@
 #include <Networking/ServerSocket.h>
 #include <Networking/sockets.h>
 #include "Tools/Exceptions.h"
+#include "Tools/time-func.h"

 #include <netinet/ip.h>
 #include <netinet/tcp.h>
@@ -46,10 +47,10 @@ ServerSocket::ServerSocket(int Portnum) : portnum(Portnum), thread(0)
  gethostname((char*)my_name,512);

  /* bind serv information to mysocket
-   *   - Just assume it will eventually wake up
   */
  fl=1;
-  while (fl!=0)
+  RunningTimer timer;
+  while (fl!=0 and timer.elapsed() < 600)
    { fl=::bind(main_socket, (struct sockaddr *)&serv, sizeof(struct sockaddr));
      if (fl != 0)
        { cerr << "Binding to socket on " << my_name << ":" << Portnum << " failed, trying again in a second ..." << endl;
@@ -136,6 +137,9 @@ void ServerSocket::accept_clients()
      struct sockaddr dest;
      memset(&dest, 0, sizeof(dest));    /* zero the struct before filling the fields */
      int socksize = sizeof(dest);
+#ifdef DEBUG_NETWORKING
+      fprintf(stderr, "Accepting...\n");
+#endif
      int consocket = accept(main_socket, (struct sockaddr *)&dest, (socklen_t*) &socksize);
      if (consocket<0) { error("set_up_socket:accept"); }

--- a/Networking/data.h
+++ b/Networking/data.h
@@ -18,6 +18,18 @@
 #endif


+inline void short_memcpy(void* out, void* in, size_t n_bytes) // copy n_bytes (1 to 8) bytes from in to out
+{
+    switch (n_bytes) // map the run-time length onto a compile-time avx_memcpy<N> instantiation
+    {
+#define X(N) case N: avx_memcpy<N>(out, in); break;
+    X(1) X(2) X(3) X(4) X(5) X(6) X(7) X(8)
+#undef X // keep the helper macro local to this function
+    default: // every other length, including 0, is rejected
+        throw invalid_length("length outside range");
+    }
+}
+
 inline void encode_length(octet *buff, size_t len, size_t n_bytes)
 {
    if (n_bytes > 8)
@@ -31,7 +43,7 @@ inline void encode_length(octet *buff, size_t len, size_t n_bytes)
    }
    // use little-endian for optimization
    uint64_t tmp = htole64(len);
-    avx_memcpy(buff, (void*)&tmp, n_bytes);
+    short_memcpy(buff, (void*)&tmp, n_bytes);
 }

 inline size_t decode_length(octet *buff, size_t n_bytes)
@@ -39,7 +51,7 @@ inline size_t decode_length(octet *buff, size_t n_bytes)
    if (n_bytes > 8)
        throw invalid_length("length field cannot be more than 64 bits");
    uint64_t tmp = 0;
-    avx_memcpy((void*)&tmp, buff, n_bytes);
+    short_memcpy((void*)&tmp, buff, n_bytes);
    return le64toh(tmp);
 }

--- a/Networking/sockets.cpp
+++ b/Networking/sockets.cpp
@@ -9,23 +9,12 @@ using namespace std;

 void error(const char *str)
 {
+  int old_errno = errno; // capture errno first, before the calls below get a chance to overwrite it
  char err[1000];
  gethostname(err,1000);
  strcat(err," : ");
  strcat(err,str);
-  perror(err);
-  throw bad_value();
-}
-
-void error(const char *str1,const char *str2)
-{
-  char err[1000];
-  gethostname(err,1000);
-  strcat(err," : ");
-  strcat(err,str1);
-  strcat(err,str2);
-  perror(err);
-  throw bad_value();
+  throw runtime_error(string() + err + " : " + strerror(old_errno)); // message is "<hostname> : <str> : <text for the saved errno>"
 }

 void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
@@ -35,7 +24,7 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
   hints.ai_family = AF_INET;
   hints.ai_flags = AI_CANONNAME;

-   octet my_name[512];
+   char my_name[512];
   memset(my_name,0,512*sizeof(octet));
   gethostname((char*)my_name,512);

@@ -88,36 +77,39 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
   int attempts = 0;
   long wait = 1;
   int fl;
+   int connect_errno;
   do
-   {  fl=1;
-      while (fl==1 || errno==EINPROGRESS)
-        {
-          mysocket = socket(AF_INET, SOCK_STREAM, 0);
-          if (mysocket < 0)
-            error("set_up_socket:socket");
+   {
+       mysocket = socket(AF_INET, SOCK_STREAM, 0);
+       if (mysocket < 0)
+         error("set_up_socket:socket");

-          fl=connect(mysocket, addr, len);
-          attempts++;
-          if (fl != 0)
-            {
-              close(mysocket);
-              usleep(wait *= 2);
+       fl = connect(mysocket, addr, len);
+       connect_errno = errno;
+       attempts++;
+       if (fl != 0)
+         {
+           close(mysocket);
+           usleep(wait *= 2);
 #ifdef DEBUG_NETWORKING
-              string msg = "Connecting to " + string(hostname) + ":" +
-                  to_string(Portnum) + " failed";
-              perror(msg.c_str());
+           string msg = "Connecting to " + string(hostname) + ":" +
+               to_string(Portnum) + " failed";
+           errno = connect_errno;
+           perror(msg.c_str());
 #endif
-            }
-        }
+         }
+       errno = connect_errno;
   }
-   while (fl == -1 && (errno == ECONNREFUSED || errno == ETIMEDOUT)
-            && timer.elapsed() < 60);
+   while (fl == -1
+       && (errno == ECONNREFUSED || errno == ETIMEDOUT || errno == EINPROGRESS)
+       && timer.elapsed() < 60);

   if (fl < 0)
     {
-       cout << attempts << " attempts to " << hostname << ":" << Portnum
-           << endl;
-       error("set_up_socket:connect:", hostname);
+       throw runtime_error(
+           string() + "cannot connect from " + my_name + " to " + hostname + ":"
+               + to_string(Portnum) + " after " + to_string(attempts)
+               + " attempts in one minute because " + strerror(connect_errno));
     }

   freeaddrinfo(ai);
@@ -127,9 +119,6 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
  fl= setsockopt(mysocket, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int));
  if (fl<0) { error("set_up_socket:setsockopt");  }

-  fl=setsockopt(mysocket, SOL_SOCKET, SO_REUSEADDR, (char*)&one, sizeof(int));
-  if (fl<0) { error("set_up_socket:setsockopt"); }
-
 #ifdef __APPLE__
  int flags = fcntl(mysocket, F_GETFL, 0);
  fl = fcntl(mysocket, F_SETFL, O_NONBLOCK |  flags);
@@ -147,5 +136,3 @@ void close_client_socket(int socket)
      error(tmp);
    }
 }
-
-unsigned long long sent_amount = 0, sent_counter = 0;
--- a/Networking/sockets.h
+++ b/Networking/sockets.h
@@ -24,7 +24,6 @@
 using namespace std;


-void error(const char *str1,const char *str2);
 void error(const char *str);

 void set_up_client_socket(int& mysocket,const char* hostname,int Portnum);
@@ -42,9 +41,6 @@ template<class T>
 void receive(T socket, octet* msg, size_t len);


-extern unsigned long long sent_amount, sent_counter;
-
-
 inline size_t send_non_blocking(int socket, octet* msg, size_t len)
 {
  int j = send(socket,msg,len,MSG_DONTWAIT);
@@ -66,9 +62,6 @@ inline void send(int socket,octet *msg,size_t len)
    {
      i += send_non_blocking(socket, msg + i, len - i);
    }
-
-  sent_amount += len;
-  sent_counter++;
 }

 template<class T>
--- a/OT/BaseOT.cpp
+++ b/OT/BaseOT.cpp
@@ -7,10 +7,18 @@
 #include <fstream>
 #include <pthread.h>

+#ifndef NO_AVX_OT
 extern "C" {
 #include "SimpleOT/ot_sender.h"
 #include "SimpleOT/ot_receiver.h"
 }
+#endif
+
+#include "ECDSA/P256Element.h"
+
+#ifdef USE_RISTRETTO
+#include "ECDSA/CurveElement.h"
+#endif

 using namespace std;

@@ -70,7 +78,57 @@ void send_if_ot_receiver(TwoPartyPlayer* P, vector<octetStream>& os, OT_ROLE rol

 void BaseOT::exec_base(bool new_receiver_inputs)
 {
-    if (not cpu_has_avx())
+#ifdef NO_AVX_OT // base OTs computed with curve elements in this file instead of calling SimpleOT
+#ifdef USE_RISTRETTO
+    typedef CurveElement Element;
+#else
+    typedef P256Element Element; // default group when USE_RISTRETTO is not defined
+#endif
+
+    Element::init();
+
+    vector<Element::Scalar> as, bs; // as: one sender scalar per OT; bs is not referenced again in this branch
+    vector<Element> As; // elements built from the scalars in as
+    SeededPRNG G;
+    vector<octetStream> os(2); // os[0] is packed locally, os[1] is read after each exchange
+
+    if (ot_role & SENDER)
+        for (int i = 0; i < nOT; i++)
+        {
+            as.push_back(G.get<Element::Scalar>()); // fresh random scalar a_i
+            As.push_back(as.back()); // Element constructed from a_i (presumably a_i times the generator; confirm in Element)
+            As.back().pack(os[0]);
+        }
+
+    send_if_ot_sender(P, os, ot_role);
+    os[0].reset_write_head(); // start writing os[0] from the beginning again for the reply
+
+    if (ot_role & RECEIVER)
+        for (int i = 0; i < nOT; i++)
+        {
+            if (new_receiver_inputs) // choice bits are resampled only on request
+                receiver_inputs[i] = G.get_bit();
+            auto b = G.get<Element::Scalar>(); // fresh random scalar b_i
+            Element B = b;
+            auto A = os[1].get<Element>(); // next element read from os[1] (the sender's A_i)
+            if (receiver_inputs[i]) // choice bit 1: offset B by A
+                B += A;
+            B.pack(os[0]);
+            receiver_outputs[i] = (A * b).hash(AES_BLK_SIZE); // receiver's output is the hash of A * b
+        }
+
+    send_if_ot_receiver(P, os, ot_role);
+
+    if (ot_role & SENDER)
+        for (int i = 0; i < nOT; i++)
+        {
+            auto B = os[1].get<Element>(); // next element read from os[1] (the receiver's B_i)
+            sender_inputs.at(i).at(0) = (B * as[i]).hash(AES_BLK_SIZE); // output 0: hash of B * a_i
+            sender_inputs.at(i).at(1) = ((B - As[i]) * as[i]).hash(AES_BLK_SIZE); // output 1: hash of (B - A_i) * a_i
+        }
+
+#else
+    if (not cpu_has_avx(true))
        throw runtime_error("SimpleOT needs AVX support");

    int i, j, k;
@@ -179,6 +237,7 @@ void BaseOT::exec_base(bool new_receiver_inputs)
        printf("\n");
        #endif
    }
+#endif

    for (int i = 0; i < nOT; i++)
    {
--- a/OT/BitMatrix.h
+++ b/OT/BitMatrix.h
@@ -6,9 +6,9 @@
 #ifndef OT_BITMATRIX_H_
 #define OT_BITMATRIX_H_

+#include "Tools/intrinsics.h"
+
 #include <vector>
-#include <emmintrin.h>
-#include <immintrin.h>
 #include <iostream>

 using namespace std;
--- a/OT/OTExtension.cpp
+++ b/OT/OTExtension.cpp
@@ -5,8 +5,7 @@
 #include "Math/gf2n.h"
 #include "Tools/aes.h"
 #include "Tools/MMO.h"
-#include <wmmintrin.h>
-#include <emmintrin.h>
+#include "Tools/intrinsics.h"


 OTExtension::OTExtension(const BaseOT& baseOT, TwoPartyPlayer* player,
--- a/OT/square128.cpp
+++ b/OT/square128.cpp
@@ -3,13 +3,12 @@
 *
 */

-#include <smmintrin.h>
-#include <immintrin.h>
 #include <mpirxx.h>

 #include "BitMatrix.h"
 #include "Tools/random.h"
 #include "Tools/BitVector.h"
+#include "Tools/intrinsics.h"
 #include "Math/Square.h"

 union matrix16x8
--- a/Processor/Instruction.h
+++ b/Processor/Instruction.h
@@ -287,7 +287,6 @@ enum

 // Register types
 enum RegType {
-  MODP,
  INT,
  SBIT,
  CBIT,
--- a/Processor/Instruction.hpp
+++ b/Processor/Instruction.hpp
@@ -342,7 +342,6 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos)

      // write to external client, input is : opcode num_args, client_id, message_type, var1, var2 ...
      case WRITESOCKETC:
-      case WRITESOCKETS:
      case WRITESOCKETSHARE:
      case WRITESOCKETINT:
        num_var_args = get_int(s) - 2;
@@ -350,6 +349,8 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos)
        r[1] = get_int(s);
        get_vector(num_var_args, start, s);
        break;
+      case WRITESOCKETS:
+        throw runtime_error("sending MACs to client not supported any more");
      case CONNECTIPV4:
        throw runtime_error("parties as clients not supported any more");
      case READCLIENTPUBLICKEY:
@@ -590,6 +591,7 @@ int BaseInstruction::get_reg_type() const
    case SHLCI:
    case SHRCI:
    case CONVINT:
+    case PUBINPUT:
      return CINT;
    default:
      if (is_gf2n_instruction())
@@ -1145,29 +1147,17 @@ inline void Instruction::execute(Processor<sint, sgf2n>& Proc) const
        // read shares and MAC shares
        Proc.read_socket_private(Proc.read_Ci(r[0]), start, true);
        break;
-      case GREADSOCKETS:
-        //Proc.get_S2_ref(r[0]).get_share().pack(socket_octetstream);
-        //Proc.get_S2_ref(r[0]).get_mac().pack(socket_octetstream);
-        break;
      case WRITESOCKETINT:
-        Proc.write_socket(INT, CLEAR, false, Proc.read_Ci(r[0]), r[1], start);
+        Proc.write_socket(INT, Proc.read_Ci(r[0]), r[1], start);
        break;
      case WRITESOCKETC:
-        Proc.write_socket(MODP, CLEAR, false, Proc.read_Ci(r[0]), r[1], start);
-        break;
-      case WRITESOCKETS:
-        // Send shares + MACs
-        Proc.write_socket(MODP, SECRET, true, Proc.read_Ci(r[0]), r[1], start);
+        Proc.write_socket(CINT, Proc.read_Ci(r[0]), r[1], start);
        break;
      case WRITESOCKETSHARE:
        // Send only shares, no MACs
        // N.B. doesn't make sense to have a corresponding read instruction for this
-        Proc.write_socket(MODP, SECRET, false, Proc.read_Ci(r[0]), r[1], start);
+        Proc.write_socket(SINT, Proc.read_Ci(r[0]), r[1], start);
        break;
-      /*case GWRITESOCKETS:
-        Proc.get_S2_ref(r[0]).get_share().pack(socket_octetstream);
-        Proc.get_S2_ref(r[0]).get_mac().pack(socket_octetstream);
-        break;*/
      case WRITEFILESHARE:
        // Write shares to file system
        Proc.write_shares_to_file(start);
--- a/Processor/Machine.h
+++ b/Processor/Machine.h
@@ -45,6 +45,8 @@ class Machine : public BaseMachine
  // Keep record of used offline data
  DataPositions pos;

+  Player* P;
+
  void load_program(const string& threadname, const string& filename);

  public:
@@ -75,6 +77,7 @@ class Machine : public BaseMachine
      const string& memtype, int lg2, bool direct, int opening_sum,
      bool receive_threads, int max_broadcast, bool use_encryption, bool live_prep,
      OnlineOptions opts);
+  ~Machine();

  const Names& get_N() { return N; }

--- a/Processor/Machine.hpp
+++ b/Processor/Machine.hpp
@@ -49,7 +49,6 @@ Machine<sint, sgf2n>::Machine(int my_number, Names& playerNames,
  // make directory for outputs if necessary
  mkdir_p(PREP_DIR);

-  Player* P;
  if (use_encryption)
    P = new CryptoPlayer(N, 0xF00);
  else
@@ -103,8 +102,6 @@ Machine<sint, sgf2n>::Machine(int my_number, Names& playerNames,
      ot_setups.push_back({ *P, true });
  }

-  delete P;
-
  /* Set up the threads */
  tinfo.resize(nthreads);
  threads.resize(nthreads);
@@ -131,6 +128,12 @@ Machine<sint, sgf2n>::Machine(int my_number, Names& playerNames,
    }
 }

+template<class sint, class sgf2n>
+Machine<sint, sgf2n>::~Machine() // releases the Player kept in member P
+{
+  delete P; // P is allocated with new in the constructor and is still used by run()
+}
+
 template<class sint, class sgf2n>
 void Machine<sint, sgf2n>::load_program(const string& threadname,
    const string& filename)
@@ -318,7 +321,7 @@ void Machine<sint, sgf2n>::run()
  print_timers();
  cerr << "Data sent = " << data_sent / 1e6 << " MB" << endl;

-  PlainPlayer P(N, 0xFF00);
+  auto& P = *this->P;
  Bundle<octetStream> bundle(P);
  bundle.mine.store(data_sent.load());
  P.Broadcast_Receive_no_stats(bundle);
--- a/Processor/OfflineMachine.hpp
+++ b/Processor/OfflineMachine.hpp
@@ -34,6 +34,8 @@ template<class W>
 template<class T, class U>
 int OfflineMachine<W>::run()
 {
+    T::clear::init_default(this->online_opts.prime_length());
+    U::clear::init_field(U::clear::default_degree());
    T::bit_type::mac_key_type::init_field();
    auto binary_mac_key = read_generate_write_mac_key<typename T::bit_type>(P);
    GC::ShareThread<typename T::bit_type> thread(playerNames,
@@ -52,7 +54,6 @@ template<class W>
 template<class T>
 void OfflineMachine<W>::generate()
 {
-    T::clear::init_default(this->online_opts.prime_length());
    T::clear::next::template init<typename T::clear>(false);
    T::clear::template write_setup<T>(P.num_players());
    auto mac_key = read_generate_write_mac_key<T>(P);
--- a/Processor/OnlineMachine.hpp
+++ b/Processor/OnlineMachine.hpp
@@ -200,12 +200,10 @@ void OnlineMachine::start_networking()
    } else {
      if (not opt.get("-ext-server")->isSet)
      {
-        if (my_port != Names::DEFAULT_PORT)
-          throw runtime_error("cannot set port number when not using Server.x");
        if (nplayers == 0)
          opt.get("-N")->getInt(nplayers);
        server = Server::start_networking(playerNames, mynum, nplayers,
-            hostname, pnbase);
+            hostname, pnbase, my_port);
      }
      else
      {
--- a/Processor/Processor.h
+++ b/Processor/Processor.h
@@ -224,7 +224,7 @@ class Processor : public ArithmeticProcessor
  // Access to external client sockets for reading clear/shared data
  void read_socket_ints(int client_id, const vector<int>& registers);
  
-  void write_socket(const RegType reg_type, const SecrecyType secrecy_type, const bool send_macs,
+  void write_socket(const RegType reg_type,
                             int socket_id, int message_type, const vector<int>& registers);

  void read_socket_vector(int client_id, const vector<int>& registers);
--- a/Processor/Processor.hpp
+++ b/Processor/Processor.hpp
@@ -241,7 +241,7 @@ void Processor<sint, sgf2n>::split(const Instruction& instruction)
 // If message_type is > 0, send message_type in bytes 0 - 3, to allow an external client to
 //  determine the data structure being sent in a message.
 template<class sint, class sgf2n>
-void Processor<sint, sgf2n>::write_socket(const RegType reg_type, const SecrecyType secrecy_type, const bool send_macs,
+void Processor<sint, sgf2n>::write_socket(const RegType reg_type,
                             int socket_id, int message_type, const vector<int>& registers)
 {
  int m = registers.size();
@@ -254,26 +254,23 @@ void Processor<sint, sgf2n>::write_socket(const RegType reg_type, const SecrecyT

  for (int i = 0; i < m; i++)
  {
-    if (reg_type == MODP && secrecy_type == SECRET) {
-      // Send vector of secret shares and optionally macs
-      if (send_macs)
-        get_Sp_ref(registers[i]).pack(socket_stream);
-      else
-        get_Sp_ref(registers[i]).pack(socket_stream,
-            sint::get_rec_factor(P.my_num(), P.num_players()));
+    if (reg_type == SINT) {
+      // Send vector of secret shares
+      get_Sp_ref(registers[i]).pack(socket_stream,
+          sint::get_rec_factor(P.my_num(), P.num_players()));
    }
-    else if (reg_type == MODP && secrecy_type == CLEAR) {
+    else if (reg_type == CINT) {
      // Send vector of clear public field elements
      get_Cp_ref(registers[i]).pack(socket_stream);
    }
-    else if (reg_type == INT && secrecy_type == CLEAR) {
+    else if (reg_type == INT) {
      // Send vector of 32-bit clear ints
      socket_stream.store((int&)get_Ci_ref(registers[i]));
    } 
    else {
      stringstream ss;
      ss << "Write socket instruction with unknown reg type " << reg_type << 
-        " and secrecy type " << secrecy_type << "." << endl;      
+        "." << endl;
      throw Processor_Error(ss.str());
    }
  }
--- a/Programs/Source/benchmark_mobilenet.mpc
+++ b/Programs/Source/benchmark_mobilenet.mpc
@@ -1,4 +1,11 @@
-import ml
+import ml, sys
+
+if len(program.args) < 2:
+   print("You need to identify a concrete network such as 'v1_0.25_128'.",
+         file=sys.stderr)
+   print("Refer to https://github.com/anderspkd/SecureQ8 for scripts to run "
+         "this benchmark.", file=sys.stderr)
+   exit(1)

 network = program.args[1]

--- a/Programs/Source/gc_oram.mpc
+++ b/Programs/Source/gc_oram.mpc
@@ -3,7 +3,7 @@ prog = program
 from Compiler.GC.types import *
 from Compiler.GC.instructions import *

-bits.unit = 128
+bits.unit = 64

 program.to_merge = [ldmsdi, stmsdi, ldmsd, stmsd, stmsdci, xors, andrs]
 program.stop_class = type(None)
@@ -11,7 +11,7 @@ program.stop_class = type(None)
 from Compiler.circuit_oram import *
 from Compiler import circuit_oram

-from Compiler import oram
+import oram
 oram.n_threads = 1
 oram.n_threads_for_tree = 1

--- a/Protocols/FakeShare.h
+++ b/Protocols/FakeShare.h
@@ -58,16 +58,6 @@ public:
    {
    }

-    void add(T a, T b, int = 0, T = {})
-    {
-        *this = a + b;
-    }
-
-    void sub(T a, T b, int = 0, T = {})
-    {
-        *this = a - b;
-    }
-
    static void split(vector<bit_type>& dest, const vector<int>& regs,
            int n_bits, const This* source, int n_inputs,
            GC::FakeSecret::Protocol& protocol);
--- a/Protocols/MAC_Check.h
+++ b/Protocols/MAC_Check.h
@@ -161,15 +161,6 @@ public:
  void exchange(const Player& P);
 };

-template <class T>
-class Passing_MAC_Check : public Direct_MAC_Check<T>
-{
-public:
-  Passing_MAC_Check(const typename T::mac_key_type::Scalar& ai);
-
-  void exchange(const Player& P);
-};
-

 enum mc_timer { SEND, RECV_ADD, BCAST, RECV_SUM, SEED, COMMIT, WAIT_SUMMER, RECV, SUM, SELECT, MAX_TIMER };

--- a/Protocols/MAC_Check.hpp
+++ b/Protocols/MAC_Check.hpp
@@ -390,25 +390,6 @@ void Direct_MAC_Check<T>::exchange(const Player& P)
  this->CheckIfNeeded(P);
 }

-template<class T>
-Passing_MAC_Check<T>::Passing_MAC_Check(const typename T::mac_key_type::Scalar& ai) :
-    Direct_MAC_Check<T>(ai)
-{
-}
-
-template<class T>
-void passing_add_openings(vector<T>& values, octetStream& os)
-{
-  octetStream new_os;
-  for (unsigned int i=0; i<values.size(); i++)
-    {
-      T tmp;
-      tmp.unpack(os);
-      (tmp + values[i]).pack(new_os);
-    }
-  os = new_os;
-}
-
 template<class T>
 void Direct_MAC_Check<T>::init_open(const Player& P, int n)
 {
@@ -422,20 +403,4 @@ void Direct_MAC_Check<T>::prepare_open(const T& secret)
  this->macs.push_back(secret.get_mac());
 }

-template<class T>
-void Passing_MAC_Check<T>::exchange(const Player& P)
-{
-  this->pre_exchange(P);
-  for (int i = 0; i < P.num_players() - 1; i++)
-    {
-      P.pass_around(this->os);
-      passing_add_openings(this->values, this->os);
-    }
-  for (auto& x : this->values)
-    x.unpack(this->os);
-  this->AddToValues(this->values);
-  this->popen_cnt += this->values.size();
-  this->CheckIfNeeded(P);
-}
-
 #endif
--- a/Protocols/MalRepRingPrep.hpp
+++ b/Protocols/MalRepRingPrep.hpp
@@ -125,7 +125,6 @@ template<class U>
 void ShuffleSacrifice::shuffle(vector<U>& check_triples, Player& P)
 {
    int buffer_size = check_triples.size();
-    assert(buffer_size >= minimum_n_inputs());

    // shuffle
    GlobalPRNG G(P);
@@ -137,13 +136,24 @@ void ShuffleSacrifice::shuffle(vector<U>& check_triples, Player& P)
    }
 }

+template<class T>
+TripleShuffleSacrifice<T>::TripleShuffleSacrifice()
+{
+}
+
+template<class T>
+TripleShuffleSacrifice<T>::TripleShuffleSacrifice(int B, int C) :
+        ShuffleSacrifice(B, C)
+{
+}
+
 template<class T>
 void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
        vector<array<T, 3>>& check_triples, Player& P,
        typename T::MAC_Check& MC, ThreadQueues* queues)
 {
    int buffer_size = check_triples.size();
-    int N = (buffer_size - C) / B;
+    size_t N = (buffer_size - C) / B;

    shuffle(check_triples, P);

@@ -161,7 +171,9 @@ void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
        if (typename T::clear(opened[3 * i] * opened[3 * i + 1]) != opened[3 * i + 2])
            throw Offline_Check_Error("shuffle opening");

-    triples.resize(N);
+    // triples might be same as check_triples
+    if (triples.size() < N)
+        triples.resize(N);

    if (queues)
    {
@@ -172,6 +184,8 @@ void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
    }
    else
        triple_sacrifice(triples, check_triples, P, MC, 0, N);
+
+    triples.resize(N);
 }

 template<class T>
@@ -188,6 +202,7 @@ void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
    int N = buffer_size / B;
    int size = end - begin;
    masked.reserve(2 * size);
+    assert(size_t(end * B) <= check_triples.size());
    for (int i = begin; i < end; i++)
    {
        T& a = check_triples[i][0];
--- a/Protocols/ReplicatedPrep.hpp
+++ b/Protocols/ReplicatedPrep.hpp
@@ -326,7 +326,8 @@ void buffer_bits_spec(ReplicatedPrep<T<gfp_<X, L>>>& prep, vector<T<gfp_<X, L>>>
    typename T<gfp_<X, L>>::Protocol& prot)
 {
    (void) bits, (void) prot;
-    if (prot.get_n_relevant_players() > 10)
+    if (prot.get_n_relevant_players() > 10
+            or OnlineOptions::singleton.bits_from_squares)
        buffer_bits_from_squares(prep);
    else
        prep.ReplicatedRingPrep<T<gfp_<X, L>>>::buffer_bits();
--- a/Protocols/ShamirShare.h
+++ b/Protocols/ShamirShare.h
@@ -101,15 +101,6 @@ public:
        T::assign(buffer);
    }

-    void add(const ShamirShare& x, const ShamirShare& y)
-    {
-        *this = x + y;
-    }
-    void sub(const ShamirShare& x, const ShamirShare& y)
-    {
-        *this = x - y;
-    }
-
    void add(const ShamirShare& S, const clear aa, int my_num,
            const T& alphai)
    {
--- a/Protocols/Share.h
+++ b/Protocols/Share.h
@@ -18,7 +18,6 @@ template<class T> class Share;

 template<class T> class MAC_Check_;
 template<class T> class Direct_MAC_Check;
-template<class T> class Passing_MAC_Check;
 template<class T> class MascotMultiplier;
 template<class T> class MascotFieldPrep;
 template<class T> class MascotTripleGenerator;
--- a/Protocols/ShuffleSacrifice.h
+++ b/Protocols/ShuffleSacrifice.h
@@ -27,6 +27,7 @@ public:
    const int C;

    ShuffleSacrifice();
+    ShuffleSacrifice(int B, int C);

    int minimum_n_inputs(int n_outputs = 1)
    {
@@ -56,6 +57,9 @@ template<class T>
 class TripleShuffleSacrifice : public ShuffleSacrifice
 {
 public:
+    TripleShuffleSacrifice();
+    TripleShuffleSacrifice(int B, int C);
+
    void triple_sacrifice(vector<array<T, 3>>& triples,
            vector<array<T, 3>>& check_triples, Player& P,
            typename T::MAC_Check& MC, ThreadQueues* queues = 0);
--- a/Protocols/ShuffleSacrifice.hpp
+++ b/Protocols/ShuffleSacrifice.hpp
@@ -19,6 +19,12 @@ ShuffleSacrifice::ShuffleSacrifice() :
 {
 }

+inline
+ShuffleSacrifice::ShuffleSacrifice(int B, int C) :
+        B(B), C(C)
+{
+}
+
 template<class T>
 void TripleShuffleSacrifice<T>::triple_combine(vector<array<T, 3> >& triples,
        vector<array<T, 3> >& to_combine, Player& P,
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ parties and malicious security.
 On Linux, this requires a working toolchain and [all
 requirements](#requirements). On Ubuntu, the following might suffice:
 ```
-apt-get install automake build-essential git libboost-dev libboost-thread-dev libsodium-dev libssl-dev libtool m4 python texinfo yasm
+apt-get install automake build-essential git libboost-dev libboost-thread-dev libntl-dev libsodium-dev libssl-dev libtool m4 python3 texinfo yasm
 ```
 On MacOS, this requires [brew](https://brew.sh) to be installed,
 which will be used for all dependencies.
@@ -77,13 +77,74 @@ The following table lists all protocols that are fully supported.
 | Malicious, dishonest majority | [MASCOT / LowGear / HighGear](#secret-sharing) | [SPDZ2k](#secret-sharing) | [Tiny / Tinier](#secret-sharing) | [BMR](#bmr) |
 | Covert, dishonest majority | [CowGear / ChaiGear](#secret-sharing) | N/A | N/A | N/A |
 | Semi-honest, dishonest majority | [Semi / Hemi / Soho](#secret-sharing) | [Semi2k](#secret-sharing) | [SemiBin](#secret-sharing) | [Yao's GC](#yaos-garbled-circuits) / [BMR](#bmr) |
-| Malicious, honest majority | [Shamir / Rep3 / PS / SY](#honest-majority) | [Brain / Rep[34] / PS / SY](#honest-majority) | [Rep3 / CCD](#honest-majority) | [BMR](#bmr) |
+| Malicious, honest majority | [Shamir / Rep3 / PS / SY](#honest-majority) | [Brain / Rep[34] / PS / SY](#honest-majority) | [Rep3 / CCD / PS](#honest-majority) | [BMR](#bmr) |
 | Semi-honest, honest majority | [Shamir / Rep3](#honest-majority) | [Rep3](#honest-majority) | [Rep3 / CCD](#honest-majority) | [BMR](#bmr) |

 See [this paper](https://eprint.iacr.org/2020/300) for an explanation
 of the various security models and high-level introduction to
 multi-party computation.

+##### Finding the most efficient protocol
+
+Lower security requirements generally allow for more efficient
+protocols. Within the same security model (row in the table above),
+there are a few things to consider:
+
+- Computation domain: Arithmetic protocols (modulo prime or power of
+  two) are preferable for many applications because they offer integer
+  addition and multiplication at low cost. However, binary circuits
+  might be a better option if there is very little integer
+  computation. [See below](#finding-the-most-efficient-variant) to
+  find the most efficient mixed-circuit variant.  Furthermore, local
+  computation modulo a power of two is cheaper, but MP-SPDZ does not
+  offer this domain with homomorphic encryption.
+
+- Secret sharing vs garbled circuits: Computation using secret sharing
+  requires a number of communication rounds that grows depending on
+  the computation, which is not the case for garbled
+  circuits. However, the cost of integer computation as a binary
+  circuit often offsets this. MP-SPDZ only offers garbled circuits
+  with binary computation.
+
+- Underlying technology for dishonest majority: While secret sharing
+  alone suffices for honest-majority computation, dishonest majority
+  requires either homomorphic encryption (HE) or oblivious transfer
+  (OT). The two options offer a computation-communication trade-off:
+  While OT is easier to compute, HE requires less
+  communication. Furthermore, the latter requires a certain amount of
+  batching to be efficient, which makes OT preferable for smaller
+  tasks.
+
+- Malicious, honest-majority three-party computation: A number of
+  protocols are available for this setting, but SY/SPDZ-wise is the
+  most efficient one for a number of reasons: It requires the lowest
+  communication, and it is the only one offering constant-communication
+  dot products.
+
+- Minor variants: Some command-line options change aspects of the
+  protocols such as:
+
+  - `--bucket-size`: In some malicious binary computation and
+    malicious edaBit generation, a smaller bucket size allows
+    preprocessing in smaller batches at a higher asymptotic cost.
+
+  - `--batch-size`: Preprocessing in smaller batches avoids generating
+    too much but larger batches save communication rounds.
+
+  - `--direct`: In dishonest-majority protocols, direct communication
+    instead of star-shaped saves communication rounds at the expense
+    of a quadratic amount of communication. This might be beneficial with a small
+    number of parties.
+
+  - `--bits-from-squares`: In some protocols computing modulo a prime
+    (Shamir, Rep3, SPDZ-wise), this switches from generating random
+    bits via XOR of parties' inputs to generation using the root of a
+    random square.
+
+  - `--top-gear`: In protocols with malicious security using
+    homomorphic encryption, this reduces the memory usage and batch
+    size for preprocessing.
+
 #### Paper and Citation

 The design of MP-SPDZ is described in [this
@@ -151,13 +212,16 @@ phase outputs the amount of offline material required, which allows to
 compute the preprocessing time for a particular computation.

 #### Requirements
- - GCC 5 or later (tested with up to 10) or LLVM/clang 5 or later (tested with up to 11). We recommend clang because it performs better.
+
+ - GCC 5 or later (tested with up to 10) or LLVM/clang 5 or later
+   (only x86; tested with up to 11). For x86, we recommend clang
+   because it performs better.
 - MPIR library, compiled with C++ support (use flag `--enable-cxx` when running configure). You can use `make -j8 tldr` to install it locally.
 - libsodium library, tested against 1.0.16
 - OpenSSL, tested against 1.1.1
 - Boost.Asio with SSL support (`libboost-dev` on Ubuntu), tested against 1.65
 - Boost.Thread for BMR (`libboost-thread-dev` on Ubuntu), tested against 1.65
- - 64-bit CPU
+ - x86 or ARM 64-bit CPU (the latter tested with AWS Graviton)
 - Python 3.5 or later
 - NTL library for homomorphic encryption (optional; tested with NTL 10.5)
 - If using macOS, Sierra or later
@@ -168,13 +232,14 @@ compute the preprocessing time for a particular computation.

 - By default, the binaries are optimized for the CPU you are
   compiling on.
-   For all optimizations, a CPU supporting AES-NI, PCLMUL, AVX2, BMI2, ADX is
+   For all optimizations on x86, a CPU supporting AES-NI, PCLMUL, AVX2, BMI2, ADX is
   required. This includes mainstream processors released 2014 or later.
   If you intend to run on a different CPU than compiling, you might
   need to change the `ARCH` variable in `CONFIG` or `CONFIG.mine` to
   `-march=<cpu>`. See the [GCC
   documentation](https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html)
-   for the possible options.
+   for the possible options. To run OT-based protocols on x86 without AVX,
+   add `AVX_OT = 0` in addition.
 - To benchmark online-only protocols or Overdrive offline phases, add the following line at the top: `MY_CFLAGS = -DINSECURE`
 - `PREP_DIR` should point to a local, unversioned directory to store preprocessing data (the default is `Player-Data` in the current directory).
 - For homomorphic encryption, set `USE_NTL = 1`.
@@ -299,6 +364,19 @@ compiler where `n` is the number of parties for the standard variant
 and 2 for the special
 variant by Mohassel and Rindal (available in Rep3 only).

+##### Finding the most efficient variant
+
+Where available, local share conversion is likely the most efficient
+variant. Protocols based on Shamir secret sharing are unlikely to
+benefit from mixed-circuit computation because they use an extension
+field for binary computation. Otherwise, edaBits likely offer an
+asymptotic benefit. However, malicious protocols by default generate
+large batches of edaBits (more than one million at once), which is
+only worthwhile for accordingly large computation. For smaller
+computation, try running the virtual machines with `-B 4` or `-B 5`,
+which reduces the batch size to ~10,000 and ~1,000, respectively, at a
+higher asymptotic cost.
+
 #### Bristol Fashion circuits

 Bristol Fashion is the name of a description format of binary circuits
@@ -386,7 +464,8 @@ This runs the compiled bytecode in cleartext computation.

 Some full implementations require oblivious transfer, which is
 implemented as OT extension based on
-https://github.com/mkskeller/SimpleOT.
+https://github.com/mkskeller/SimpleOT or OpenSSL (activate the
+latter with `AVX_OT = 0` in `CONFIG` or `CONFIG.mine`).

 ### Secret sharing

@@ -524,6 +603,7 @@ The following table shows all programs for honest-majority computation:
 | `rep4-ring-party.x` | Replicated | Mod 2^k | Y | 4 | `rep4-ring.sh` |
 | `replicated-bin-party.x` | Replicated | Binary | N | 3 | `replicated.sh` |
 | `malicious-rep-bin-party.x` | Replicated | Binary | Y | 3 | `mal-rep-bin.sh` |
+| `ps-rep-bin-party.x` | Replicated | Binary | Y | 3 | `ps-rep-bin.sh` |
 | `replicated-field-party.x` | Replicated | Mod prime | N | 3 | `rep-field.sh` |
 | `ps-rep-field-party.x` | Replicated | Mod prime | Y | 3 | `ps-rep-field.sh` |
 | `sy-rep-field-party.x` | SPDZ-wise replicated | Mod prime | Y | 3 | `sy-rep-field.sh` |
@@ -537,7 +617,7 @@ The following table shows all programs for honest-majority computation:
 We use the "generate random triple optimistically/sacrifice/Beaver"
 methodology described by [Lindell and
 Nof](https://eprint.iacr.org/2017/816) to achieve malicious
-security with plain replicated secret sharing,
+security with plain arithmetic replicated secret sharing,
 except for the "PS" (post-sacrifice) protocols where the
 actual multiplication is executed optimistically and checked later as
 also described by Lindell and Nof.
@@ -563,6 +643,13 @@ secret value and information-theoretic tag similar to SPDZ but not
 with additive secret sharing, hence the name.
 Rep4 refers to the four-party protocol by [Dalskov et
 al.](https://eprint.iacr.org/2020/1330).
+`malicious-rep-bin-party.x` is based on cut-and-choose triple
+generation by [Furukawa et al.](https://eprint.iacr.org/2016/944) but
+using Beaver multiplication instead of their post-sacrifice
+approach. `ps-rep-bin-party.x` is based on the post-sacrifice approach
+by [Araki et
+al.](https://www.ieee-security.org/TC/SP2017/papers/96.pdf) but
+without using their cache optimization.

 All protocols in this section require encrypted channels because the
 information received by the honest majority suffices the reconstruct
--- a/Scripts/bmr-program-run.sh
+++ b/Scripts/bmr-program-run.sh
@@ -35,5 +35,7 @@ done
 $prefix ./bmr-program-tparty.x $prog $netmap 2>&1 &> bmr-log/t &
 for i in $(seq $[n_players-1]); do
    $prefix ./bmr-program-party.x $i $prog $netmap $threshold 2>&1 &> bmr-log/$i &
+    id=$!
 done
 $prefix ./bmr-program-party.x $n_players $prog $netmap $threshold 2>&1 | tee bmr-log/$n_players
+wait $id
--- a/Scripts/ccd.sh
+++ b/Scripts/ccd.sh
@@ -3,7 +3,7 @@
 HERE=$(cd `dirname $0`; pwd)
 SPDZROOT=$HERE/..

-export PLAYERS=3
+export PLAYERS=${PLAYERS:-3}

 . $HERE/run-common.sh

--- a/Scripts/mal-ccd.sh
+++ b/Scripts/mal-ccd.sh
@@ -3,7 +3,7 @@
 HERE=$(cd `dirname $0`; pwd)
 SPDZROOT=$HERE/..

-export PLAYERS=3
+export PLAYERS=${PLAYERS:-3}

 . $HERE/run-common.sh

--- a/Scripts/ps-rep-bin.sh
+++ b/Scripts/ps-rep-bin.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+HERE=$(cd `dirname $0`; pwd)
+SPDZROOT=$HERE/..
+
+export PLAYERS=3
+
+. $HERE/run-common.sh
+
+run_player ps-rep-bin-party.x $* || exit 1
--- a/Scripts/test_tutorial.sh
+++ b/Scripts/test_tutorial.sh
@@ -7,6 +7,8 @@ while getopts XYC opt; do
 	   ;;
 	Y) dabit=2
 	   ;;
+	C) cont=1
+	   ;;
    esac
 done

@@ -31,7 +33,7 @@ function test_vm
 	    echo == Party $i
 	    cat logs/tutorial-$i
 	done
-	exit 1
+	test -z $cont && exit 1
    fi
 }

@@ -88,7 +90,7 @@ fi

 ./compile.py -B 16  $compile_opts tutorial

-for i in replicated mal-rep-bin semi-bin ccd mal-ccd; do
+for i in replicated mal-rep-bin ps-rep-bin semi-bin ccd mal-ccd; do
    test_vm $i $run_opts
 done

--- a/Tools/BitVector.cpp
+++ b/Tools/BitVector.cpp
@@ -41,7 +41,7 @@ BitVector BitVector::operator &(const BitVector& other) const

 bool BitVector::parity() const
 {
-#if defined(__SSE4_2__) or not defined(__clang__)
+#if (defined(__SSE4_2__) or not defined(__clang__)) and defined(__x86_64__)
    bool res = 0;
    for (size_t i = 0; i < size_bytes() / 8; i++)
        res ^= _popcnt64(((word*)bytes)[i]) & 1;
@@ -49,7 +49,17 @@ bool BitVector::parity() const
        res ^= _popcnt32(bytes[i]) & 1;
    return res;
 #else
-    throw runtime_error("need to compile with SSE4.2 support or GCC");
+    bool res = 0;
+    for (size_t i = 0; i < size_bytes() / 8; i++)
+    {
+        word x = ((word*)bytes)[i];
+        for (int i = 5; i >= 0; i--)
+            x ^= (x >> (1 << i));
+        res ^= (x & 1);
+    }
+    for (size_t i = size_bytes() / 8 * 8; i < size_bytes(); i++)
+        res ^= (*this)[i];
+    return res;
 #endif
 }

@@ -131,12 +141,19 @@ void BitVector::input(istream& s,bool human)

 void BitVector::pack(octetStream& o) const
 {
-    o.store(nbytes);
+    o.store_int(nbits, 8);
    o.append((octet*)bytes, nbytes);
 }

 void BitVector::unpack(octetStream& o)
 {
-    o.get(nbytes);
+    resize(o.get_int(8));
    o.consume((octet*)bytes, nbytes);
 }
+
+BitVector& BitVector::operator =(const octetStream other)
+{
+    resize(other.get_length() * 8);
+    memcpy(bytes, other.get_data(), nbytes);
+    return *this;
+}
--- a/Tools/BitVector.h
+++ b/Tools/BitVector.h
@@ -7,7 +7,6 @@
 #include <vector>
 using namespace std;
 #include <stdlib.h>
-#include <pmmintrin.h>
 #include <assert.h>

 #include "Tools/Exceptions.h"
@@ -15,6 +14,7 @@ using namespace std;
 // just for util functions
 #include "Math/gf2nlong.h"
 #include "Math/FixedVec.h"
+#include "Tools/intrinsics.h"

 class PRNG;
 class octetStream;
@@ -137,6 +137,8 @@ class BitVector
        return *this;
    }

+    BitVector& operator=(const octetStream other);
+
    void swap(BitVector& other)
    {
        std::swap(nbits, other.nbits);
@@ -156,7 +158,7 @@ class BitVector
        void operator=(const Access& other) { *this = other.get(); }
        void operator^=(const Access& other) { *this = get() ^ other.get(); }
        bool operator==(const Access& other) const { return get() == other.get(); }
-        bool operator==(bool b) const { return get() == b; }
+        operator bool() const { return get(); }
    };

    bool operator[](int i) const { return get_bit(i); }
@@ -242,6 +244,11 @@ class BitVector
        return true;
    }

+    bool operator==(const BitVector& other)
+    {
+        return equals(other);
+    }
+
    void append(const BitVector& other, size_t length);

    void randomize(PRNG& G);
--- a/Tools/aes-arm.h
+++ b/Tools/aes-arm.h
@@ -0,0 +1,328 @@
+// This file is reduced to functionality necessary for AES in order to avoid
+// conflicts with simde.
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#else
+#error "Macro name collisions may happen with unsupported compiler."
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an _m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&   \
+    ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
+     (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) ||  \
+     (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_DATA(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+// In the absence of crypto extensions, implement aesenc using regular neon
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information Reproduced with permission of the author.
+//
+// Software emulation of one AES encryption round (x86 `aesenc`): transforms
+// the 128-bit state in EncBlock and XORs RoundKey into the result.
+// NOTE(review): this appears to be the variant used when __ARM_FEATURE_CRYPTO
+// is not defined (the matching #if is above this function) - confirm.
+//
+// Two code paths are provided:
+//   - AArch64: NEON table lookups for ShiftRows/SubBytes plus vector
+//     arithmetic for MixColumns.
+//   - otherwise: four 256-entry 32-bit lookup tables built at compile time
+//     from SSE2NEON_AES_DATA.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    // Source-byte index for every output byte of the "shift rows" lookup.
+    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+                                         0xc, 0x1, 0x6, 0xb};
+    // Lookup indices that rotate the bytes inside each 32-bit group by one
+    // position (output byte 0 <- input byte 1, ..., output byte 3 <- input
+    // byte 0); used by the last "mix columns" step.
+    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    // The S-box is consulted in four 64-byte slices. Each following lookup
+    // rebases the index by another 0x40; the vqtbx4q_u8 extension form leaves
+    // lanes whose rebased index is out of range untouched, so every lane ends
+    // up with the value from the slice that actually contains its index.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
+
+    // mix columns
+    // Double every byte; the arithmetic shift of the signed view produces an
+    // all-ones mask for bytes whose top bit was set, selecting the 0x1b
+    // reduction constant for exactly those bytes.
+    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+    // Fold in v with the two 16-bit halves of each 32-bit word swapped.
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    // Fold in (v ^ w) with each 32-bit group byte-rotated via ror32by8.
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    //  add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+// Helper macros used only to generate aes_table below; all of them are
+// #undef'd again right after the table definition.
+//   B2W packs four bytes into one 32-bit word, b0 in the low byte.
+//   F2  shifts x left by one and XORs in 0x011b when bit 7 of x was set.
+//   F3  is F2(x) ^ x.
+//   U0..U3 place p, p, F2(p) and F3(p) in four different byte arrangements.
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
+    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
+     (b0))
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+    // One 256-entry table per byte position; SSE2NEON_AES_DATA (defined
+    // elsewhere in this file) applies the given macro to each table entry.
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    // x0..x3 are the four 32-bit lanes of the input state (the shuffle
+    // immediates 0x55/0xAA/0xFF broadcast lane 1/2/3 into lane 0).
+    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
+    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
+    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
+    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+
+    // Each output word XORs one entry from every table, indexed by byte k of
+    // a different input word. _mm_set_epi32 takes the highest lane first, so
+    // the arguments below are output lanes 3, 2, 1, 0 in that order.
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    // Final step of the round: XOR in the round key.
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform the last round of an AES encryption flow on the state in a, using
+// the round key in RoundKey, and return the result. The bytes of a are
+// permuted (ShiftRows), substituted through SSE2NEON_sbox (SubBytes) and then
+// XORed with RoundKey; there is no MixColumns step in this round.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    /* TODO: scalar fallback; a NEON-optimized version is still outstanding. */
+
+    // ShiftRows expressed as a gather: output byte i comes from input byte
+    // shift_rows[i]. A plain table is used instead of C99 array designators
+    // ("[0] = {...}"), which are not part of ISO C++ and therefore break or
+    // warn when this header is included from C++ translation units.
+    static const uint8_t shift_rows[16] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+                                           0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+                                           0xc, 0x1, 0x6, 0xb};
+
+    // SubBytes on the permuted state. The result goes to a temporary because
+    // a is overwritten in place below and every input byte must still be
+    // readable while the permutation is applied.
+    uint8_t v[16];
+    for (int i = 0; i < 16; i++) {
+        const int src = shift_rows[i];
+        v[i] = SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, src)];
+    }
+
+    // AddRoundKey: XOR each substituted byte with the matching key byte.
+    for (int i = 0; i < 16; i++)
+        vreinterpretq_nth_u8_m128i(a, i) =
+            v[i] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
+    return a;
+}
+
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+//
+// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+{
+    // Fetch 32-bit lanes 1 and 3 of the key (0x55 / 0xFF broadcast the lane
+    // into position 0, from where it is read back as a scalar).
+    const uint32_t lane1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+    const uint32_t lane3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+
+    // Run every byte of both lanes through the S-box, keeping each byte in
+    // its original position within the word.
+    uint32_t sub1 = 0;
+    uint32_t sub3 = 0;
+    for (int shift = 0; shift < 32; shift += 8) {
+        sub1 |= (uint32_t) SSE2NEON_sbox[(lane1 >> shift) & 0xff] << shift;
+        sub3 |= (uint32_t) SSE2NEON_sbox[(lane3 >> shift) & 0xff] << shift;
+    }
+
+    // Rotate each substituted word right by 8 bits for the odd output lanes.
+    const uint32_t rot1 = (sub1 >> 8) | (sub1 << 24);
+    const uint32_t rot3 = (sub3 >> 8) | (sub3 << 24);
+
+    // Output lanes, highest first: rot(sub3)^rcon, sub3, rot(sub1)^rcon, sub1.
+    return _mm_set_epi32(rot3 ^ rcon, sub3, rot1 ^ rcon, sub1);
+}
+#undef SSE2NEON_AES_DATA
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Equivalent of x86 'aesenc' built from the ARMv8 crypto instructions. AESE
+// XORs its key operand in *before* the round, whereas aesenc applies the key
+// *after* it, so AESE is issued with an all-zero key, AESMC follows, and the
+// real round key is applied with a separate XOR at the end. That costs one
+// extra XOR; the compiler should be able to optimize it away for repeated
+// calls. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    const uint8x16_t zero_key = vdupq_n_u8(0);
+    const uint8x16_t round_key = vreinterpretq_u8_m128i(b);
+
+    // AESE with a zero key, then AESMC on its result.
+    const uint8x16_t substituted =
+        vaeseq_u8(vreinterpretq_u8_m128i(a), zero_key);
+    const uint8x16_t mixed = vaesmcq_u8(substituted);
+
+    // Apply the real round key last.
+    return vreinterpretq_m128i_u8(mixed ^ round_key);
+}
+
+// Last AES encryption round: AESE with an all-zero key (no AESMC, so no
+// column mixing), followed by an XOR with the real round key.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    const uint8x16_t zero_key = vdupq_n_u8(0);
+    const uint8x16_t substituted =
+        vaeseq_u8(vreinterpretq_u8_m128i(a), zero_key);
+
+    return _mm_xor_si128(vreinterpretq_m128i_u8(substituted), RoundKey);
+}
+
+// Equivalent of x86 'aeskeygenassist' built on AESE. With X1 and X3 denoting
+// 32-bit lanes 1 and 3 of a, the result lanes (low to high) are
+// SubBytes(X1), ROT(SubBytes(X1)) ^ rcon, SubBytes(X3), ROT(SubBytes(X3)) ^ rcon.
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE with an all-zero key performs ShiftRows and SubBytes on a.
+    const uint8x16_t zero_key = vdupq_n_u8(0);
+    uint8x16_t sb = vaeseq_u8(vreinterpretq_u8_m128i(a), zero_key);
+
+    // ShiftRows moved the bytes around, so pick the substituted bytes of X1
+    // and X3 back out of their shifted positions; the ROT rows list the same
+    // four bytes starting one position later.
+    uint8x16_t words = {
+        sb[0x4], sb[0x1], sb[0xE], sb[0xB],  // SubBytes(X1)
+        sb[0x1], sb[0xE], sb[0xB], sb[0x4],  // ROT(SubBytes(X1))
+        sb[0xC], sb[0x9], sb[0x6], sb[0x3],  // SubBytes(X3)
+        sb[0x9], sb[0x6], sb[0x3], sb[0xC],  // ROT(SubBytes(X3))
+    };
+
+    // The round constant is XORed into the two rotated lanes only.
+    uint32x4_t rcon_lanes = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return _mm_xor_si128(vreinterpretq_m128i_u8(words),
+                         vreinterpretq_m128i_u32(rcon_lanes));
+}
+#endif
--- a/Show More
+++ b/Show More