diff --git a/.gitmodules b/.gitmodules
index 193b3677..455a5514 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "Programs/Circuits"]
 	path = Programs/Circuits
 	url = https://github.com/mkskeller/bristol-fashion
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde
diff --git a/BMR/Key.h b/BMR/Key.h
index f3e1fb01..6ee48794 100644
--- a/BMR/Key.h
+++ b/BMR/Key.h
@@ -7,11 +7,10 @@
 #define COMMON_INC_KEY_H_
 
 #include <iostream>
-#include <emmintrin.h>
-#include <smmintrin.h>
 #include <string.h>
 
 #include "Tools/FlexBuffer.h"
+#include "Tools/intrinsics.h"
 #include "Math/gf2nlong.h"
 
 using namespace std;
diff --git a/BMR/Party.cpp b/BMR/Party.cpp
index a7275c0d..5ca1360a 100644
--- a/BMR/Party.cpp
+++ b/BMR/Party.cpp
@@ -371,7 +371,7 @@ void FakeProgramParty::receive_spdz_wires(ReceivedMsg& msg)
 		spdz_mac_key.unpack(spdz_wires[op].back());
 		if (!MC)
 		{
-			MC = new Passing_MAC_Check<Share<gf2n_long>>(spdz_mac_key);
+			MC = new MAC_Check_<Share<gf2n_long>>(spdz_mac_key);
 			cout << "MAC key: " << hex << spdz_mac_key << endl;
 			mac_key = spdz_mac_key;
 		}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ca31b200..fe65cc49 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 The changelog explains changes pulled through from the private development repository. Bug fixes and small enhancements are committed between releases and not documented here.
 
+## 0.2.4 (Apr 19, 2021)
+
+- ARM support
+- Base OTs optionally without SimpleOT/AVX
+- Use OpenSSL instead of Crypto++ for elliptic curves
+- Post-sacrifice binary computation with replicated secret sharing similar
+  to [Araki et al.](https://www.ieee-security.org/TC/SP2017/papers/96.pdf)
+- More flexible multithreading
+
 ## 0.2.3 (Feb 23, 2021)
 
 - Distributed key generation for homomorphic encryption with active security similar to [Rotaru et al.](https://eprint.iacr.org/2019/1300)
diff --git a/CONFIG b/CONFIG
index 9348d851..54370858 100644
--- a/CONFIG
+++ b/CONFIG
@@ -3,7 +3,6 @@ ROOT = .
 OPTIM= -O3
 #PROF = -pg
 #DEBUG = -DDEBUG
-#MEMPROTECT = -DMEMPROTECT
 GDEBUG = -g
 
 # set this to your preferred local storage directory
@@ -12,8 +11,8 @@ PREP_DIR = '-DPREP_DIR="Player-Data/"'
 # set for SHE preprocessing (SPDZ and Overdrive)
 USE_NTL = 0
 
-# set for using GF(2^128) online phase, OT, MASCOT, or BMR
-# unset for GF(2^40) online and offline phase
+# set for using GF(2^128)
+# unset for GF(2^40)
 USE_GF2N_LONG = 1
 
 # set to -march=<architecture> for optimization
@@ -28,6 +27,24 @@ USE_GF2N_LONG = 1
 ARCH = -mtune=native -msse4.1 -msse4.2 -maes -mpclmul -mavx -mavx2 -mbmi2 -madx
 ARCH = -march=native
 
+MACHINE := $(shell uname -m)
+OS := $(shell uname -s)
+ifeq ($(MACHINE), x86_64)
+# set this to 0 to avoid using AVX for OT
+ifeq ($(OS), Linux)
+CHECK_AVX := $(shell grep -q avx /proc/cpuinfo; echo $$?)
+ifeq ($(CHECK_AVX), 0)
+AVX_OT = 1
+else
+AVX_OT = 0
+endif
+else
+AVX_OT = 1
+endif
+else
+AVX_OT = 0
+endif
+
 # allow to set compiler in CONFIG.mine
 CXX = g++
 
@@ -38,6 +55,10 @@ ifeq ($(USE_GF2N_LONG),1)
 GF2N_LONG = -DUSE_GF2N_LONG
 endif
 
+ifeq ($(AVX_OT), 0)
+CFLAGS += -DNO_AVX_OT
+endif
+
 # MAX_MOD_SZ (for FHE) must be least and GFP_MOD_SZ (for computation)
 # must be exactly ceil(len(p)/len(word)) for the relevant prime p
 # GFP_MOD_SZ only needs to be set for primes of bit length more that 256.
@@ -51,7 +72,6 @@ ifeq ($(USE_NTL),1)
 LDLIBS := -lntl $(LDLIBS)
 endif
 
-OS := $(shell uname -s)
 ifeq ($(OS), Linux)
 LDLIBS += -lrt
 endif
@@ -62,12 +82,10 @@ else
 BOOST = -lboost_thread $(MY_BOOST)
 endif
 
-CFLAGS += $(ARCH) $(MY_CFLAGS) $(GDEBUG) -Wextra -Wall $(OPTIM) -I$(ROOT) -pthread $(PROF) $(DEBUG) $(MOD) $(MEMPROTECT) $(GF2N_LONG) $(PREP_DIR) $(SECURE) -std=c++11 -Werror
+CFLAGS += $(ARCH) $(MY_CFLAGS) $(GDEBUG) -Wextra -Wall $(OPTIM) -I$(ROOT) -pthread $(PROF) $(DEBUG) $(MOD) $(GF2N_LONG) $(PREP_DIR) $(SECURE) -std=c++11 -Werror
 CPPFLAGS = $(CFLAGS)
 LD = $(CXX)
 
-ECLIB = -lcryptopp
-
 ifeq ($(OS), Darwin)
 ifeq ($(USE_NTL),1)
 CFLAGS += -Wno-error=unused-parameter
diff --git a/Compiler/GC/types.py b/Compiler/GC/types.py
index 4a413469..51ed2aac 100644
--- a/Compiler/GC/types.py
+++ b/Compiler/GC/types.py
@@ -284,7 +284,7 @@ class sbits(bits):
     Instances can be also be initalized from :py:obj:`~Compiler.types.regint`
     and :py:obj:`~Compiler.types.sint`.
     """
-    max_length = 128
+    max_length = 64
     reg_type = 'sb'
     is_clear = False
     clear_type = cbits
diff --git a/Compiler/comparison.py b/Compiler/comparison.py
index 7dfa3673..977bb4b2 100644
--- a/Compiler/comparison.py
+++ b/Compiler/comparison.py
@@ -190,6 +190,8 @@ def TruncLeakyInRing(a, k, m, signed):
     Returns a >> m.
     Requires a < 2^k and leaks a % 2^m (needs to be constant or random).
     """
+    if k == m:
+        return 0
     assert k > m
     assert int(program.options.ring) >= k
     from .types import sint, intbitint, cint, cgf2n
diff --git a/Compiler/dijkstra.py b/Compiler/dijkstra.py
index 61a65247..45d25e6b 100644
--- a/Compiler/dijkstra.py
+++ b/Compiler/dijkstra.py
@@ -103,7 +103,7 @@ class HeapQ(object):
         childpos = MemValue(start * shift)
         @for_range(self.levels - 1)
         def f(i):
-            parentpos = childpos.right_shift(1, self.levels)
+            parentpos = childpos.right_shift(1, self.levels + 1)
             parent, parent_state = self.heap.read_and_maybe_remove(parentpos)
             child, child_state = self.heap.read_and_maybe_remove(childpos)
             swap = parent > child
diff --git a/Compiler/floatingpoint.py b/Compiler/floatingpoint.py
index 570d5459..f2b47e88 100644
--- a/Compiler/floatingpoint.py
+++ b/Compiler/floatingpoint.py
@@ -1,3 +1,4 @@
+import math
 from math import log, floor, ceil
 from Compiler.instructions import *
 from . import types
@@ -411,6 +412,8 @@ def TruncInRing(to_shift, l, pow2m):
     return types.sint.bit_compose(reversed(bits))
 
 def SplitInRing(a, l, m):
+    if l == 1:
+        return m.if_else(a, 0), m.if_else(0, a), 1
     pow2m = Pow2(m, l, None)
     upper = TruncInRing(a, l, pow2m)
     lower = a - upper * pow2m
@@ -620,27 +623,36 @@ def BITLT(a, b, bit_length):
 def BitDecFull(a):
     from .library import get_program, do_while, if_, break_point
     from .types import sint, regint, longint
-    p=int(get_program().options.prime)
+    p = get_program().prime
     assert p
     bit_length = p.bit_length()
-    bbits = [sint(size=a.size) for i in range(bit_length)]
-    tbits = [[sint(size=1) for i in range(bit_length)] for j in range(a.size)]
-    pbits = util.bit_decompose(p)
-    # Loop until we get some random integers less than p
-    done = [regint(0) for i in range(a.size)]
-    @do_while
-    def get_bits_loop():
+    logp = int(round(math.log(p, 2)))
+    if abs(p - 2 ** logp) / p < 2 ** -get_program().security:
+        # inspired by Rabbit (https://eprint.iacr.org/2021/119)
+        # no need for exact randomness generation
+        # if modulo a power of two is close enough
+        bbits = [sint.get_random_bit(size=a.size) for i in range(logp)]
+        if logp != bit_length:
+            bbits += [sint(0, size=a.size)]
+    else:
+        bbits = [sint(size=a.size) for i in range(bit_length)]
+        tbits = [[sint(size=1) for i in range(bit_length)] for j in range(a.size)]
+        pbits = util.bit_decompose(p)
+        # Loop until we get some random integers less than p
+        done = [regint(0) for i in range(a.size)]
+        @do_while
+        def get_bits_loop():
+            for j in range(a.size):
+                @if_(done[j] == 0)
+                def _():
+                    for i in range(bit_length):
+                        tbits[j][i].link(sint.get_random_bit())
+                    c = regint(BITLT(tbits[j], pbits, bit_length).reveal())
+                    done[j].link(c)
+            return (sum(done) != a.size)
         for j in range(a.size):
-            @if_(done[j] == 0)
-            def _():
-                for i in range(bit_length):
-                    tbits[j][i].link(sint.get_random_bit())
-                c = regint(BITLT(tbits[j], pbits, bit_length).reveal())
-                done[j].link(c)
-        return (sum(done) != a.size)
-    for j in range(a.size):
-        for i in range(bit_length):
-            movs(bbits[i][j], tbits[j][i])
+            for i in range(bit_length):
+                movs(bbits[i][j], tbits[j][i])
     b = sint.bit_compose(bbits)
     c = (a-b).reveal()
     t = (p-c).bit_decompose(bit_length)
diff --git a/Compiler/instructions.py b/Compiler/instructions.py
index 75ae760c..693a6aeb 100644
--- a/Compiler/instructions.py
+++ b/Compiler/instructions.py
@@ -1577,19 +1577,6 @@ class writesocketc(base.IOInstruction):
     def has_var_args(self):
         return True
 
-@base.vectorize
-class writesockets(base.IOInstruction):
-    """
-    Write a variable number of secret shares + MACs from registers into a socket
-    for a specified client id, message_type
-    """
-    __slots__ = []
-    code = base.opcodes['WRITESOCKETS']
-    arg_format = tools.chain(['ci', 'int'], itertools.repeat('s'))
-
-    def has_var_args(self):
-        return True
-
 @base.vectorize
 class writesocketshare(base.IOInstruction):
     """ Write a variable number of shares (without MACs) from secret
diff --git a/Compiler/instructions_base.py b/Compiler/instructions_base.py
index b5e071c7..09c477a9 100644
--- a/Compiler/instructions_base.py
+++ b/Compiler/instructions_base.py
@@ -903,7 +903,7 @@ class DirectMemoryWriteInstruction(DirectMemoryInstruction, \
                                        WriteMemoryInstruction):
     __slots__ = []
     def __init__(self, *args, **kwargs):
-        if program.curr_tape.prevent_direct_memory_write:
+        if not program.curr_tape.singular:
             raise CompilerError('Direct memory writing prevented in threads')
         super(DirectMemoryWriteInstruction, self).__init__(*args, **kwargs)
 
diff --git a/Compiler/library.py b/Compiler/library.py
index 811448f0..6df9e6d3 100644
--- a/Compiler/library.py
+++ b/Compiler/library.py
@@ -1062,14 +1062,14 @@ def for_range_opt_multithread(n_threads, n_loops):
     """
     return for_range_multithread(n_threads, None, n_loops)
 
-def multithread(n_threads, n_items, max_size=None):
+def multithread(n_threads, n_items=None, max_size=None):
     """
     Distribute the computation of :py:obj:`n_items` to
     :py:obj:`n_threads` threads, but leave the in-thread repetition up
     to the user.
 
     :param n_threads: compile-time (int)
-    :param n_items: regint/cint/int
+    :param n_items: regint/cint/int (default: :py:obj:`n_threads`)
 
     The following executes ``f(0, 8)``, ``f(8, 8)``, and
     ``f(16, 9)`` in three different threads:
@@ -1080,6 +1080,8 @@ def multithread(n_threads, n_items, max_size=None):
         def f(base, size):
             ...
     """
+    if n_items is None:
+        n_items = n_threads
     if max_size is None:
         return map_reduce(n_threads, None, n_items, initializer=lambda: [],
                           reducer=None, looping=False)
diff --git a/Compiler/ml.py b/Compiler/ml.py
index 1ae4bd4d..2fb115d9 100644
--- a/Compiler/ml.py
+++ b/Compiler/ml.py
@@ -703,6 +703,9 @@ class Dense(DenseBase):
         progress('f input')
 
     def forward(self, batch=None):
+        if batch is None:
+            batch = regint.Array(self.N)
+            batch.assign(regint.inc(self.N))
         self.compute_f_input(batch=batch)
         if self.activation_layer:
             self.activation_layer.forward(batch)
diff --git a/Compiler/oram.py b/Compiler/oram.py
index 7ca4cb54..77d55f78 100644
--- a/Compiler/oram.py
+++ b/Compiler/oram.py
@@ -91,7 +91,11 @@ class intBlock(Block):
             for length,start in zip(self.lengths[:-1],series(self.lengths)):
                 res.append(remainder.mod2m(length, total_length - start, False))
                 remainder -= res[-1]
-                remainder /= floatingpoint.two_power(length)
+                if Program.prog.options.ring:
+                    remainder = remainder.trunc_zeros(length,
+                                                      total_length - start, False)
+                else:
+                    remainder /= floatingpoint.two_power(length)
             res.append(remainder)
             return res
     def set_slice(self, value):
@@ -1498,12 +1502,12 @@ class PackedIndexStructure(object):
             rem = mod2m(index, self.log_entries_per_block, log2(self.size), False)
             c = mod2m(rem, self.log_entries_per_element, \
                           self.log_entries_per_block, False)
-            b = (rem - c).trunc_zeros(self.log_entries_per_element,
+            b = trunc_zeros(rem - c, self.log_entries_per_element,
                                       self.log_entries_per_block)
             if self.small:
                 return 0, b, c
             else:
-                return (index - rem).trunc_zeros(self.log_entries_per_block,
+                return trunc_zeros(index - rem, self.log_entries_per_block,
                                                  log2(self.size)), b, c
         else:
             index_bits = bit_decompose(index, log2(self.size))
diff --git a/Compiler/program.py b/Compiler/program.py
index 68fea852..74e03a1b 100644
--- a/Compiler/program.py
+++ b/Compiler/program.py
@@ -118,7 +118,6 @@ class Program(object):
         self.req_num = None
         self.tape_stack = []
         self.n_threads = 1
-        self.free_threads = set()
         self.public_input_file = None
         self.types = {}
         self.budget = int(self.options.budget)
@@ -206,6 +205,28 @@ class Program(object):
         self.progname = progname
 
     def new_tape(self, function, args=[], name=None, single_thread=False):
+        """
+        Create a new tape from a function. See
+        :py:func:`~Compiler.library.multithread` and
+        :py:func:`~Compiler.library.for_range_opt_multithread` for
+        easier-to-use higher-level functionality. The following runs
+        two threads defined by two different functions::
+
+            def f():
+                ...
+            def g():
+                ...
+            tapes = [program.new_tape(x) for x in (f, g)]
+            thread_numbers = program.run_tapes(tapes)
+            program.join_tapes(thread_numbers)
+
+        :param function: Python function defining the thread
+        :param args: arguments to the function
+        :param name: name used for files
+        :param single_thread: Boolean indicating whether tape will never be run in parallel to itself
+        :returns: tape handle
+
+        """
         if name is None:
             name = function.__name__
         name = "%s-%s" % (self.name, name)
@@ -214,7 +235,7 @@ class Program(object):
         tape_index = len(self.tapes)
         self.tape_stack.append(self.curr_tape)
         self.curr_tape = Tape(name, self)
-        self.curr_tape.prevent_direct_memory_write = not single_thread
+        self.curr_tape.singular = single_thread
         self.tapes.append(self.curr_tape)
         function(*args)
         self.finalize_tape(self.curr_tape)
@@ -226,14 +247,31 @@ class Program(object):
         return self.run_tapes([[tape_index, arg]])[0]
 
     def run_tapes(self, args):
-        if self.curr_tape is not self.tapes[0]:
+        """ Run tapes in parallel. See :py:func:`new_tape` for an example.
+
+        :param args: list of tape handles or tuples of tape handle and extra argument (for :py:func:`~Compiler.library.get_arg`)
+        :returns: list of thread numbers
+        """
+        if not self.curr_tape.singular:
             raise CompilerError('Compiler does not support ' \
                                     'recursive spawning of threads')
+        args = [list(util.tuplify(arg)) for arg in args]
+        singular_tapes = set()
+        for arg in args:
+            if self.tapes[arg[0]].singular:
+                if arg[0] in singular_tapes:
+                    raise CompilerError('cannot run singular tape in parallel')
+                singular_tapes.add(arg[0])
+            assert len(arg)
+            assert len(arg) <= 2
+            if len(arg) == 1:
+                arg += [0]
         thread_numbers = []
         while len(thread_numbers) < len(args):
-            if self.free_threads:
-                thread_numbers.append(min(self.free_threads))
-                self.free_threads.remove(thread_numbers[-1])
+            free_threads = self.curr_tape.free_threads
+            if free_threads:
+                thread_numbers.append(min(free_threads))
+                free_threads.remove(thread_numbers[-1])
             else:
                 thread_numbers.append(self.n_threads)
                 self.n_threads += 1
@@ -247,10 +285,18 @@ class Program(object):
         return thread_numbers
 
     def join_tape(self, thread_number):
+        self.join_tapes([thread_number])
+
+    def join_tapes(self, thread_numbers):
+        """ Wait for completion of tapes.  See :py:func:`new_tape` for an example.
+
+        :param thread_numbers: list of thread numbers
+        """
         self.curr_tape.start_new_basicblock(name='pre-join_tape')
-        Compiler.instructions.join_tape(thread_number)
+        for thread_number in thread_numbers:
+            Compiler.instructions.join_tape(thread_number)
+            self.curr_tape.free_threads.add(thread_number)
         self.curr_tape.start_new_basicblock(name='post-join_tape')
-        self.free_threads.add(thread_number)
 
     def update_req(self, tape):
         if self.req_num is None:
@@ -259,6 +305,7 @@ class Program(object):
             self.req_num += tape.req_num
     
     def write_bytes(self):
+
         """ Write all non-empty threads and schedule to files. """
 
         nonempty_tapes = [t for t in self.tapes]
@@ -312,7 +359,7 @@ class Program(object):
         """ Allocate memory from the top """
         if not isinstance(size, int):
             raise CompilerError('size must be known at compile time')
-        if (creator_tape or self.curr_tape) != self.tapes[0]:
+        if not (creator_tape or self.curr_tape).singular:
             raise CompilerError('cannot allocate memory outside main thread')
         if size == 0:
             return
@@ -510,7 +557,8 @@ class Tape:
         self.req_bit_length = defaultdict(lambda: 0)
         self.function_basicblocks = {}
         self.functions = []
-        self.prevent_direct_memory_write = False
+        self.singular = True
+        self.free_threads = set()
 
     class BasicBlock(object):
         def __init__(self, parent, name, scope, exit_condition=None):
diff --git a/Compiler/types.py b/Compiler/types.py
index efcf738c..0255499c 100644
--- a/Compiler/types.py
+++ b/Compiler/types.py
@@ -21,7 +21,13 @@ Basic types
 -----------
 
 Basic types contain many special methods such as :py:func:`__add__`. This is
-used for operator overloading in Python. In some operations such as
+used for operator overloading in Python. It is not recommended to use
+them, use the plain operators instead, such as ``+`` instead of
+:py:func:`__add__`. See
+https://docs.python.org/3/reference/datamodel.html#special-method-names
+for a translation to operators.
+
+In some operations such as
 secure comparison, the secure computation protocols allows for more
 parameters than just the operands which influence the performance. In
 this case, we provide an alias for better code readability. For
@@ -780,7 +786,12 @@ class cint(_clear, _int):
 
     @vectorized_classmethod
     def read_from_socket(cls, client_id, n=1):
-        """ Read a list of clear values from socket. """
+        """ Receive clear value(s) from client.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: cint (if n=1) or list of cint
+        """
         res = [cls() for i in range(n)]
         readsocketc(client_id, *res)
         if n == 1:
@@ -790,7 +801,11 @@ class cint(_clear, _int):
 
     @vectorized_classmethod
     def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of clear values to socket """
+        """ Send a list of clear values to a client.
+
+        :param client_id: Client id (regint)
+        :param values: list of cint
+        """
         writesocketc(client_id, message_type, *values)
 
     @vectorized_classmethod
@@ -1207,7 +1222,12 @@ class regint(_register, _int):
 
     @vectorized_classmethod
     def read_from_socket(cls, client_id, n=1):
-        """ Receive n register values from socket """
+        """ Receive clear integer value(s) from client.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: regint (if n=1) or list of regint
+        """
         res = [cls() for i in range(n)]
         readsocketint(client_id, *res)
         if n == 1:
@@ -1217,7 +1237,11 @@ class regint(_register, _int):
 
     @vectorized_classmethod
     def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of integers to socket """
+        """ Send a list of clear integers to a client.
+
+        :param client_id: Client id (regint)
+        :param values: list of regint
+        """
         writesocketint(client_id, message_type, *values)
 
     @vectorize_init
@@ -1805,6 +1829,14 @@ class sint(_secret, _int):
     PreOR = staticmethod(floatingpoint.PreOR)
     get_type = staticmethod(lambda n: sint)
 
+    @staticmethod
+    def require_bit_length(n_bits):
+        if program.options.ring:
+            if int(program.options.ring) < n_bits:
+                raise CompilerError('computation modulus too small')
+        else:
+            program.curr_tape.require_bit_length(n_bits)
+
     @vectorized_classmethod
     def get_random_int(cls, bits):
         """ Secret random n-bit number according to security model.
@@ -1906,7 +1938,12 @@ class sint(_secret, _int):
 
     @vectorized_classmethod
     def read_from_socket(cls, client_id, n=1):
-        """ Receive n shares and MAC shares from socket """
+        """ Receive secret-shared value(s) from client.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: sint (if n=1) or list of sint
+        """
         res = [cls() for i in range(n)]
         readsockets(client_id, *res)
         if n == 1:
@@ -1914,27 +1951,46 @@ class sint(_secret, _int):
         else:
             return res
 
-    @vectorized_classmethod
-    def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of shares and MAC shares to socket """
-        writesockets(client_id, message_type, *values)
-
     @vectorize
     def write_share_to_socket(self, client_id, message_type=ClientMessageType.NoType):
         """ Send only share to socket """
         writesocketshare(client_id, message_type, self)
 
     @vectorized_classmethod
-    def write_shares_to_socket(cls, client_id, values, message_type=ClientMessageType.NoType, include_macs=False):
+    def write_shares_to_socket(cls, client_id, values,
+                               message_type=ClientMessageType.NoType):
         """ Send shares of a list of values to a specified client socket.
 
         :param client_id: regint
         :param values: list of sint
         """
-        if include_macs:
-            writesockets(client_id, message_type, *values)
-        else:
-            writesocketshare(client_id, message_type, *values)
+        writesocketshare(client_id, message_type, *values)
+
+    @classmethod
+    def read_from_file(cls, start, n_items):
+        """ Read shares from ``Persistence/Transactions-P<playerno>.data``.
+
+        :param start: starting position in number of shares from beginning (int/regint/cint)
+        :param n_items: number of items (int)
+        :returns: final position, -1 if eof reached, or -2 if file not found (regint)
+        :returns: list of shares
+        """
+        shares = [cls(size=1) for i in range(n_items)]
+        stop = regint()
+        readsharesfromfile(regint.conv(start), stop, *shares)
+        return stop, shares
+
+    @staticmethod
+    def write_to_file(shares):
+        """ Write shares to ``Persistence/Transactions-P<playerno>.data``
+        (appending at the end).
+
+        :param shares: list or iterable of sint
+        """
+        for share in shares:
+            assert isinstance(share, sint)
+            assert share.size == 1
+        writesharestofile(*shares)
 
     @vectorized_classmethod
     def load_mem(cls, address, mem_type=None):
@@ -2920,8 +2976,14 @@ class cfix(_number, _structure):
 
     @vectorized_classmethod
     def read_from_socket(cls, client_id, n=1):
-        """ Read one or more cfix values from a socket. 
-            Sender will have already bit shifted and sent as cints."""
+        """
+        Receive clear fixed-point value(s) from client. The client needs
+        to convert the values to the right integer representation.
+
+        :param client_id: Client id (regint)
+        :param n: number of values (default 1)
+        :returns: cfix (if n=1) or list of cfix
+        """
         cint_input = cint.read_from_socket(client_id, n)
         if n == 1:
             return cfix._new(cint_inputs)
@@ -2930,7 +2992,12 @@ class cfix(_number, _structure):
         
     @vectorized_classmethod
     def write_to_socket(self, client_id, values, message_type=ClientMessageType.NoType):
-        """ Send a list of cfix values to socket. Values are sent as bit shifted cints. """
+        """ Send a list of clear fixed-point values to a client
+        (represented as clear integers).
+
+        :param client_id: Client id (regint)
+        :param values: list of cfix
+        """
         def cfix_to_cint(fix_val):
             return cint(fix_val.v)
         cint_values = list(map(cfix_to_cint, values))
@@ -3182,15 +3249,8 @@ class cfix(_number, _structure):
 
     def print_plain(self):
         """ Clear fixed-point output. """
-        if self.k > 64:
-            sign = 1 - (((self.v + (1 << (self.k - 1))) >> (self.k - 1)) & 1)
-        else:
-            tmp = regint()
-            convmodp(tmp, self.v, bitlength=self.k)
-            sign = cint(tmp < 0)
-        abs_v = sign.if_else(-self.v, self.v)
-        print_float_plain(cint(abs_v), cint(-self.f), \
-                          cint(0), cint(sign), cint(0))
+        print_float_plain(cint.conv(self.v), cint(-self.f), \
+                          cint(0), cint(0), cint(0))
 
     def output_if(self, cond):
         cond_print_plain(cond, self.v, cint(-self.f))
@@ -3206,8 +3266,14 @@ class _single(_number, _structure):
 
     @classmethod
     def receive_from_client(cls, n, client_id, message_type=ClientMessageType.NoType):
-        """ Securely obtain shares of n values input by a client.
-            Assumes client has already run bit shift to convert fixed point to integer."""
+        """
+        Securely obtain shares of values input by a client. Assumes client
+        has already converted values to integer representation.
+
+        :param n: number of inputs (int)
+        :param client_id: regint
+
+        """
         sint_inputs = cls.int_type.receive_from_client(n, client_id, ClientMessageType.TripleShares)
         return list(map(cls, sint_inputs))
 
@@ -3574,6 +3640,7 @@ class sfix(_fix):
         """ Secret fixed-point input.
 
         :param player: public (regint/cint/int) """
+        cls.int_type.require_bit_length(cls.k)
         v = cls.int_type()
         inputmixed('fix', v, cls.f, player)
         return cls._new(v)
@@ -4486,7 +4553,7 @@ class Array(object):
                 raise CompilerError('cannot assign vector to all elements')
         mem_value = MemValue(value)
         self.address = MemValue.if_necessary(self.address)
-        n_threads = 8 if use_threads and len(self) > 2**20 else 1
+        n_threads = 8 if use_threads and len(self) > 2**20 else None
         @library.for_range_multithread(n_threads, 1024, len(self))
         def f(i):
             self[i] = mem_value
diff --git a/Compiler/util.py b/Compiler/util.py
index d586d61f..fa41f41c 100644
--- a/Compiler/util.py
+++ b/Compiler/util.py
@@ -40,6 +40,12 @@ def mod2m(a, b, bits, signed):
     else:
         return a.mod2m(b, bits, signed=signed)
 
+def trunc_zeros(a, n_zeros, bit_length=None):
+    if isinstance(a, int):
+        return a >> n_zeros
+    else:
+        return a.trunc_zeros(n_zeros, bit_length)
+
 def right_shift(a, b, bits):
     if isinstance(a, int):
         return a >> b
diff --git a/ECDSA/CurveElement.cpp b/ECDSA/CurveElement.cpp
new file mode 100644
index 00000000..c272c5e2
--- /dev/null
+++ b/ECDSA/CurveElement.cpp
@@ -0,0 +1,142 @@
+/*
+ * CurveElement.cpp
+ *
+ */
+
+#include <ECDSA/CurveElement.h>
+
+#include "Math/gfp.hpp"
+
+unsigned char CurveElement::zero[crypto_core_ristretto255_BYTES];
+
+void CurveElement::init()
+{
+    Scalar::init_field(
+            (bigint(1) << 252) + bigint("27742317777372353535851937790883648493"),
+            false);
+    if(sodium_init() == -1)
+        throw runtime_error("cannot initalize sodium");
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    memset(tmp, 0, sizeof(tmp));
+    crypto_scalarmult_ristretto255_base(zero, tmp);
+}
+
+void CurveElement::convert(unsigned char* res, const Scalar& other)
+{
+    bigint tmp;
+    tmp = other;
+    assert(tmp.__get_mp()->_mp_size * sizeof(mp_limb_t) <= crypto_core_ristretto255_SCALARBYTES);
+    memset(res, 0, crypto_core_ristretto255_SCALARBYTES);
+    memcpy(res, tmp.__get_mp()->_mp_d, abs(tmp.__get_mp()->_mp_size) * sizeof(mp_limb_t));
+}
+
+CurveElement::CurveElement()
+{
+    memcpy(a, zero, sizeof(a));
+    check();
+}
+
+CurveElement::CurveElement(const Scalar& other)
+{
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    convert(tmp, other);
+    crypto_scalarmult_ristretto255_base(a, tmp);
+    check();
+}
+
+CurveElement::CurveElement(word other)
+{
+    if (other == 0)
+    {
+        *this = CurveElement();
+        return;
+    }
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    memset(tmp, 0, sizeof(tmp));
+    memcpy(tmp, &other, sizeof(other));
+    crypto_scalarmult_ristretto255_base(a, tmp);
+    check();
+}
+
+void CurveElement::check()
+{
+#ifdef CURVE_CHECK
+    if (crypto_core_ristretto255_is_valid_point(a) != 1)
+        throw runtime_error("curve point not valid");
+#endif
+}
+
+CurveElement CurveElement::operator +(const CurveElement& other) const
+{
+    CurveElement res;
+    crypto_core_ristretto255_add(res.a, a, other.a);
+    res.check();
+    return res;
+}
+
+CurveElement CurveElement::operator -(const CurveElement& other) const
+{
+    CurveElement res;
+    crypto_core_ristretto255_sub(res.a, a, other.a);
+    res.check();
+    return res;
+}
+
+CurveElement CurveElement::operator *(const Scalar& other) const
+{
+    CurveElement res;
+    unsigned char tmp[crypto_core_ristretto255_SCALARBYTES];
+    convert(tmp, other);
+    if (crypto_scalarmult_ristretto255(res.a, tmp, a) < 0)
+    {
+        cerr << "EC multiplication by zero" << endl;
+    }
+    res.check();
+    return res;
+}
+
+CurveElement& CurveElement::operator +=(const CurveElement& other)
+{
+    *this = *this + other;
+    return *this;
+}
+
+bool CurveElement::operator ==(const CurveElement& other) const
+{
+    for (size_t i = 0; i < sizeof a; i++)
+        if (a[i] != other.a[i])
+            return false;
+    return true;
+}
+
+bool CurveElement::operator !=(const CurveElement& other) const
+{
+    return not (*this == other);
+}
+
+void CurveElement::pack(octetStream& os) const
+{
+    os.append(a, sizeof(a));
+}
+
+void CurveElement::unpack(octetStream& os)
+{
+    os.consume(a, sizeof(a));
+    check();
+}
+
+ostream& operator <<(ostream& s, const CurveElement& x)
+{
+    s << hex << *(word*)x.get();
+    return s;
+}
+
+octetStream CurveElement::hash(size_t n_bytes) const
+{
+    octetStream os;
+    pack(os);
+    auto res = os.hash();
+    assert(n_bytes >= res.get_length());
+    res.resize_precise(n_bytes);
+    return res;
+}
diff --git a/ECDSA/CurveElement.h b/ECDSA/CurveElement.h
new file mode 100644
index 00000000..254271e1
--- /dev/null
+++ b/ECDSA/CurveElement.h
@@ -0,0 +1,63 @@
+/*
+ * CurveElement.h
+ *
+ */
+
+#ifndef ECDSA_CURVEELEMENT_H_
+#define ECDSA_CURVEELEMENT_H_
+
+#include <sodium.h>
+
+#include "Math/gfp.h"
+
+class CurveElement : public ValueInterface
+{
+public:
+    typedef gfp_<2, 4> Scalar;
+
+private:
+    static unsigned char zero[crypto_core_ristretto255_BYTES];
+
+    unsigned char a[crypto_core_ristretto255_BYTES]; // encoding handed to the crypto_core_ristretto255_* functions
+
+    static void convert(unsigned char* res, const Scalar& other);
+
+public:
+    typedef void next;
+    typedef void Square;
+
+    static int size() { return sizeof(a); } // number of bytes written by pack()
+    static string type_string() { return "Curve25519"; }
+
+    static void init();
+
+    CurveElement();
+    CurveElement(const Scalar& other);
+    CurveElement(word other);
+
+    void check(); // throws if the encoding is invalid, but only when CURVE_CHECK is defined
+
+    const unsigned char* get() const { return a; }
+
+    CurveElement operator+(const CurveElement& other) const;
+    CurveElement operator-(const CurveElement& other) const;
+    CurveElement operator*(const Scalar& other) const;
+
+    CurveElement& operator+=(const CurveElement& other);
+
+    bool operator==(const CurveElement& other) const; // byte-wise comparison of the encodings
+    bool operator!=(const CurveElement& other) const;
+
+    void assign_zero() { *this = 0; }
+    bool is_zero() { return *this == 0; }
+    void add(octetStream& os) { *this += os.get<CurveElement>(); }
+
+    void pack(octetStream& os) const;
+    void unpack(octetStream& os);
+
+    octetStream hash(size_t n_bytes) const; // hash of the packed element, resized to n_bytes
+};
+
+ostream& operator<<(ostream& s, const CurveElement& x);
+
+#endif /* ECDSA_CURVEELEMENT_H_ */
diff --git a/ECDSA/P256Element.cpp b/ECDSA/P256Element.cpp
index 4a4c2e38..8437f39d 100644
--- a/ECDSA/P256Element.cpp
+++ b/ECDSA/P256Element.cpp
@@ -7,72 +7,127 @@
 
 #include "Math/gfp.hpp"
 
-#include <cryptopp/oids.h>
-#include <cryptopp/misc.h>
-
-CryptoPP::DL_GroupParameters_EC<CryptoPP::ECP> P256Element::params;
-CryptoPP::ECP P256Element::curve;
+EC_GROUP* P256Element::curve;
 
 void P256Element::init()
 {
-    params = CryptoPP::DL_GroupParameters_EC<CryptoPP::ECP>(CryptoPP::ASN1::secp256k1());
-    curve = params.GetCurve();
-    auto mod = params.GetSubgroupOrder();
-    Scalar::init_field(CryptoPP::IntToString(mod).c_str(), false);
-}
-
-CryptoPP::Integer P256Element::convert(const Scalar& other)
-{
-    return CryptoPP::Integer((unsigned char*) other.get_ptr(), other.size(),
-            CryptoPP::Integer::UNSIGNED, CryptoPP::LITTLE_ENDIAN_ORDER);
+    curve = EC_GROUP_new_by_curve_name(NID_secp256k1);
+    assert(curve != 0);
+    auto modulus = EC_GROUP_get0_order(curve); // group order, used as the modulus of Scalar
+    Scalar::init_field(BN_bn2dec(modulus), false); // NOTE(review): the string returned by BN_bn2dec() is never OPENSSL_free'd
 }
 
 P256Element::P256Element()
 {
-    point = curve.Identity();
+    point = EC_POINT_new(curve);
+    if (point == 0 or EC_POINT_set_to_infinity(curve, point) == 0)
+        throw runtime_error("cannot create curve point"); // not assert(): must also run under NDEBUG
 }
 
-P256Element::P256Element(const Scalar& other)
+P256Element::P256Element(const Scalar& other) : P256Element()
 {
-    point = params.ExponentiateBase(convert(other));
+    BIGNUM* exp = BN_new();
+    BN_dec2bn(&exp, bigint(other).get_str().c_str());
+    int ok = EC_POINTs_mul(curve, point, exp, 0, 0, 0, 0); // not inside assert(): must run under NDEBUG
+    BN_free(exp);
+    if (ok == 0) throw runtime_error("cannot compute curve point from scalar");
 }
 
-P256Element::P256Element(word other)
+P256Element::P256Element(word other) : P256Element()
 {
-    point = params.ExponentiateBase(other);
+    BIGNUM* exp = BN_new();
+    BN_dec2bn(&exp, to_string(other).c_str());
+    int ok = EC_POINTs_mul(curve, point, exp, 0, 0, 0, 0); // not inside assert(): must run under NDEBUG
+    BN_free(exp);
+    if (ok == 0) throw runtime_error("cannot compute curve point from integer");
+}
+
+P256Element& P256Element::operator =(const P256Element& other)
+{
+    if (not EC_POINT_copy(point, other.point)) throw runtime_error("cannot copy curve point"); // assert() would drop the copy under NDEBUG
+    return *this;
 }
 
 void P256Element::check()
 {
-    curve.VerifyPoint(point);
+    if (EC_POINT_is_on_curve(curve, point, 0) != 1) throw runtime_error("curve point not valid"); // also checked under NDEBUG
 }
 
 P256Element::Scalar P256Element::x() const
 {
-    return bigint(IntToString(point.x));
+    BIGNUM* x = BN_new();
+    char* xx = EC_POINT_get_affine_coordinates_GFp(curve, point, x, 0, 0) ? BN_bn2dec(x) : 0;
+    BN_free(x);
+    if (xx == 0) throw runtime_error("cannot get x coordinate"); // not assert(): must run under NDEBUG
+    Scalar res((bigint(xx)));
+    OPENSSL_free(xx);
+    return res;
 }
 
 P256Element P256Element::operator +(const P256Element& other) const
 {
     P256Element res;
-    res.point = curve.Add(point, other.point);
+    if (not EC_POINT_add(curve, res.point, point, other.point, 0)) throw runtime_error("EC addition failed"); // kept under NDEBUG
     return res;
 }
 
 P256Element P256Element::operator -(const P256Element& other) const
 {
-    P256Element res;
-    res.point = curve.Add(point, curve.Inverse(other.point));
-    return res;
+    P256Element tmp = other;
+    if (not EC_POINT_invert(curve, tmp.point, 0)) throw runtime_error("EC inversion failed"); // kept under NDEBUG
+    return *this + tmp;
 }
 
 P256Element P256Element::operator *(const Scalar& other) const
 {
     P256Element res;
-    res.point = curve.Multiply(convert(other), point);
+    BIGNUM* exp = 0; // BN_dec2bn() allocates the number when handed a null pointer
+    bool ok = BN_dec2bn(&exp, bigint(other).get_str().c_str()) and EC_POINT_mul(curve, res.point, 0, point, exp, 0);
+    BN_free(exp);
+    if (not ok) throw runtime_error("EC multiplication failed"); // not assert(): must run under NDEBUG
     return res;
 }
 
+bool P256Element::operator ==(const P256Element& other) const
+{
+    const int differs = EC_POINT_cmp(curve, point, other.point, 0);
+    assert(differs == 0 or differs == 1); // any other value signals an OpenSSL error
+    return differs == 0;
+}
+
+void P256Element::pack(octetStream& os) const
+{
+    octet* buffer = 0;
+    size_t length = EC_POINT_point2buf(curve, point, POINT_CONVERSION_COMPRESSED, &buffer, 0);
+    assert(length != 0);
+    os.store_int(length, 8); // 8-byte length prefix, read back by unpack()
+    os.append(buffer, length);
+    OPENSSL_free(buffer); // EC_POINT_point2buf() allocates the buffer and leaves it to the caller
+}
+
+void P256Element::unpack(octetStream& os)
+{
+    size_t length = os.get_int(8); // length prefix written by pack()
+    // decode outside assert() so that the point is still read when NDEBUG is defined
+    if (EC_POINT_oct2point(curve, point, os.consume(length), length, 0) == 0)
+        throw runtime_error("cannot decode curve point");
+}
+
+ostream& operator <<(ostream& s, const P256Element& x)
+{
+    // compressed encoding rendered as a hexadecimal string by OpenSSL
+    char* text = EC_POINT_point2hex(x.curve, x.point, POINT_CONVERSION_COMPRESSED, 0);
+    s << text;
+    OPENSSL_free(text);
+    return s;
+}
+
+// Copy construction: allocate a point via the default constructor, then assign.
+P256Element::P256Element(const P256Element& other) : P256Element()
+{
+    operator =(other);
+}
+
 P256Element operator*(const P256Element::Scalar& x, const P256Element& y)
 {
     return y * x;
@@ -90,44 +145,17 @@ P256Element& P256Element::operator /=(const Scalar& other)
     return *this;
 }
 
-bool P256Element::operator ==(const P256Element& other) const
-{
-    return point == other.point;
-}
-
 bool P256Element::operator !=(const P256Element& other) const
 {
     return not (*this == other);
 }
 
-void P256Element::pack(octetStream& os) const
+octetStream P256Element::hash(size_t n_bytes) const
 {
-    os.serialize(point.identity);
-    size_t l;
-    l = point.x.MinEncodedSize();
-    os.serialize(l);
-    point.x.Encode(os.append(l), l);
-    l = point.y.MinEncodedSize();
-    os.serialize(l);
-    point.y.Encode(os.append(l), l);
-}
-
-void P256Element::unpack(octetStream& os)
-{
-    os.unserialize(point.identity);
-    size_t l;
-    os.unserialize(l);
-    point.x.Decode(os.consume(l), l);
-    os.unserialize(l);
-    point.y.Decode(os.consume(l), l);
-}
-
-ostream& operator <<(ostream& s, const P256Element& x)
-{
-    auto& point = x.get();
-    if (point.identity)
-        s << "ID" << endl;
-    else
-        s << point.x << "," << point.y;
-    return s;
+    octetStream packed;
+    pack(packed);
+    auto digest = packed.hash();
+    assert(n_bytes >= digest.get_length()); // the digest is never shortened
+    digest.resize_precise(n_bytes);
+    return digest;
 }
diff --git a/ECDSA/P256Element.h b/ECDSA/P256Element.h
index 6263c49f..e426bade 100644
--- a/ECDSA/P256Element.h
+++ b/ECDSA/P256Element.h
@@ -6,7 +6,8 @@
 #ifndef ECDSA_P256ELEMENT_H_
 #define ECDSA_P256ELEMENT_H_
 
-#include <cryptopp/eccrypto.h>
+#include <openssl/ec.h>
+#include <openssl/obj_mac.h>
 
 #include "Math/gfp.h"
 
@@ -16,12 +17,9 @@ public:
     typedef gfp_<2, 4> Scalar;
 
 private:
-    static CryptoPP::DL_GroupParameters_EC<CryptoPP::ECP> params;
-    static CryptoPP::ECP curve;
+    static EC_GROUP* curve;
 
-    CryptoPP::ECP::Point point;
-
-    static CryptoPP::Integer convert(const Scalar& other);
+    EC_POINT* point;
 
 public:
     typedef void next;
@@ -35,13 +33,13 @@ public:
     static void init();
 
     P256Element();
+    P256Element(const P256Element& other);
     P256Element(const Scalar& other);
     P256Element(word other);
 
-    void check();
+    P256Element& operator=(const P256Element& other);
 
-    const CryptoPP::ECP::Point& get() const { return point; }
-//    const unsigned char* get() const { return a; }
+    void check();
 
     Scalar x() const;
 
@@ -55,16 +53,18 @@ public:
     bool operator==(const P256Element& other) const;
     bool operator!=(const P256Element& other) const;
 
-    void assign_zero() { *this = 0; }
-    bool is_zero() { return *this == 0; }
+    void assign_zero() { *this = {}; }
+    bool is_zero() { return *this == P256Element(); }
     void add(octetStream& os) { *this += os.get<P256Element>(); }
 
     void pack(octetStream& os) const;
     void unpack(octetStream& os);
+
+    octetStream hash(size_t n_bytes) const;
+
+    friend ostream& operator<<(ostream& s, const P256Element& x);
 };
 
 P256Element operator*(const P256Element::Scalar& x, const P256Element& y);
 
-ostream& operator<<(ostream& s, const P256Element& x);
-
 #endif /* ECDSA_P256ELEMENT_H_ */
diff --git a/ECDSA/README.md b/ECDSA/README.md
index 5d1db349..6307a91e 100644
--- a/ECDSA/README.md
+++ b/ECDSA/README.md
@@ -5,9 +5,6 @@ in `preprocessing.hpp` and `sign.hpp`, respectively.
 
 #### Compilation
 
-- Add either `CXX = clang++` or `OPTIM = -O2` because GCC 8 or later with `-O3` will produce a segfault when using `mascot-ecdsa-party.x`
-- For older hardware, also add `ARCH = -march=native`
-- Install [Crypto++](https://www.cryptopp.com) (`libcrypto++-dev` on Ubuntu). We used version 5.6.4, which is the default on Ubuntu 18.04.
 - Compile the binaries: `make -j8 ecdsa`
 - Or compile the static binaries: `make -j8 ecdsa-static`
 
diff --git a/FHE/AddableVector.cpp b/FHE/AddableVector.cpp
index 9bf09c36..a99c0593 100644
--- a/FHE/AddableVector.cpp
+++ b/FHE/AddableVector.cpp
@@ -35,5 +35,5 @@ AddableVector<T> AddableVector<T>::mul_by_X_i(int j,
 }
 
 template
-AddableVector<fixint<0>> AddableVector<fixint<0>>::mul_by_X_i(int j,
-        const FHE_PK& pk) const;
+AddableVector<Int_Random_Coins::rand_type> AddableVector<
+        Int_Random_Coins::rand_type>::mul_by_X_i(int j, const FHE_PK& pk) const;
diff --git a/FHE/FHE_Keys.cpp b/FHE/FHE_Keys.cpp
index 564184dd..990a0d20 100644
--- a/FHE/FHE_Keys.cpp
+++ b/FHE/FHE_Keys.cpp
@@ -114,7 +114,9 @@ void FHE_PK::check_noise(const Rq_Element& x, bool check_modulo) const
       noise[i] /= pr;
       m = m > noise[i] ? m : noise[i];
     }
+#ifdef VERBOSE_KEYGEN
   cerr << "max noise: " << m << endl;
+#endif
 }
 
 
diff --git a/FHE/NTL-Subs.cpp b/FHE/NTL-Subs.cpp
index 67bb551b..3fa3cc86 100644
--- a/FHE/NTL-Subs.cpp
+++ b/FHE/NTL-Subs.cpp
@@ -516,7 +516,9 @@ void init(P2Data& P2D,const Ring& Rg)
   
   imatrix A;
   A.resize(Rg.phi_m(), imatrix::value_type(Gord*gf2n_short::degree()));
-  P2D.A.resize(A[0].size(), imatrix::value_type(A.size()));
+  P2D.A.resize(A[0].size());
+  for (auto& x : P2D.A)
+    x.resize(A.size());
   for (int slot=0; slot<Gord; slot++)
     { for (int co=0; co<gf2n_short::degree(); co++)
         { // Work out how x^co in given slot maps to plaintext vector
diff --git a/FHE/Random_Coins.h b/FHE/Random_Coins.h
index e6be91b8..65f6c3cf 100644
--- a/FHE/Random_Coins.h
+++ b/FHE/Random_Coins.h
@@ -9,7 +9,11 @@
 
 class FHE_PK;
 
-class Int_Random_Coins : public AddableMatrix<fixint<0>>
+#ifndef N_LIMBS_RAND
+#define N_LIMBS_RAND 0
+#endif
+
+class Int_Random_Coins : public AddableMatrix<fixint<N_LIMBS_RAND>>
 {
   typedef value_type::value_type T;
 
diff --git a/FHEOffline/DistKeyGen.cpp b/FHEOffline/DistKeyGen.cpp
index fb2e7cbc..255ec451 100644
--- a/FHEOffline/DistKeyGen.cpp
+++ b/FHEOffline/DistKeyGen.cpp
@@ -109,7 +109,6 @@ DistKeyGen::DistKeyGen(const FHE_Params& params, const bigint& p) :
  */
 void DistKeyGen::Gen_Random_Data(PRNG& G)
 {
-    cout << "In Gen Random Data " << endl;
     secret.from_vec(params.sampleHwt(G));
     rc1.generate(G);
     rc2.generate(G);
@@ -228,7 +227,9 @@ void check_randomness(vector<octetStream>& seeds,
     // Re-create the randomness from these seeds
     for (int i = 0; i < num_players; i++)
       { G.SetSeed(seeds[i].get_data());
+#ifdef VERBOSE_KEYGEN
         cout << "\tSeed for player " << i << " is..." << seeds[i] << endl;
+#endif
         playerKeys[i].Gen_Random_Data(G);
         globalKey += playerKeys[i];
       }
@@ -292,22 +293,27 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
 {
   const FHE_Params& params=pk.get_params();
 
+#ifdef VERBOSE_KEYGEN
   double start,stop;
   /***********************
    *       Step 1        *
    ***********************/
   start=clock();
+#endif
 
   // First compute and commit to the challenge value
   vector<unsigned int> e(P.num_players());
   vector<octetStream> Comm_e(P.num_players());
   vector<octetStream> Open_e(P.num_players());
   Commit_To_Challenge(e,Comm_e,Open_e,P,num_runs);
+
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 1 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 2        *
@@ -319,11 +325,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   vector<PRNG> G(num_runs);
   Commit_To_Seeds(G,seeds,Comm_seeds,Open_seeds,P,num_runs);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 2 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 2.5      *
@@ -340,28 +348,27 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
       keys[i].Gen_Random_Data(G[i]);
       a[i][P.my_num()] = keys[i].a;
     }
-  cout << "Generated Random Vals" << endl;
 
   if (commit)
     {
       // Do Commit and Open to Get a
       Commit_And_Open(a,P,num_runs);
-      cout << "Finished Commit and Open" << endl;
     }
   else
     {
       Transmit_Data(a,P,num_runs);
-      cout << "Finished open" << endl;
     }
   for (int i=0; i<num_runs; i++)
     keys[i].sum_a(a[i]);
 
   a.clear();
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 2.5 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 3        *
@@ -373,11 +380,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
       b[i][P.my_num()] = keys[i].b;
     }
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 3 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 4        *
@@ -387,11 +396,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   else
     Transmit_Data(b,P,num_runs);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 4 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *     Step 5/6        *
@@ -404,11 +415,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
     }
 
   b.clear();
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 5/6 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 7        *
@@ -418,11 +431,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   else
     Transmit_Data(enc_dash,P,num_runs);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 7 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *    Step 8/9/10      *
@@ -434,11 +449,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
       enc[i][P.my_num()] = keys[i].enc;
    }
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 8/9/10 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 11       *
@@ -448,11 +465,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   else
     Transmit_Data(enc,P,num_runs);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 11 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *      Step 12        *
@@ -460,11 +479,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   for (int i=0; i<num_runs; i++)
     keys[i].sum_enc(enc[i]);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 12 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *     Step 13/14      *
@@ -472,11 +493,13 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
 
   int challenge=Open_Challenge(e,Open_e,Comm_e,P,num_runs);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 13/14 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   /***********************
    *       Step 15       *
@@ -489,7 +512,10 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   /* Now Open All Bar The Challenge Run */
   for (int i = 0; i < num_runs; i++)
     { if (i != challenge)
-        { cout << "Checking run " << i << endl;
+        {
+#ifdef VERBOSE_KEYGEN
+          cout << "Checking run " << i << endl;
+#endif
           check_randomness(seeds[i], keys[i].enc, keys[i].pk, keys[i].enc_dash, P.num_players());
         }
     }
@@ -497,15 +523,19 @@ void Run_Gen_Protocol(FHE_PK& pk,FHE_SK& sk,const Player& P,int num_runs,
   // Set the key to the chosen run's output
   keys[challenge].finalize(pk, sk);
 
+#ifdef VERBOSE_KEYGEN
   cout << "Done Step 15 " << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
   start=clock();
+#endif
 
   P.Check_Broadcast();
+#ifdef VERBOSE_KEYGEN
   cout << "Broadcast check all passed" << endl;
 
   stop=clock();
   cout << "\t\tTime = " << (stop-start)/CLOCKS_PER_SEC << " seconds " << endl;
+#endif
 }
diff --git a/FHEOffline/Prover.cpp b/FHEOffline/Prover.cpp
index 230c44fb..96204976 100644
--- a/FHEOffline/Prover.cpp
+++ b/FHEOffline/Prover.cpp
@@ -15,13 +15,13 @@ Prover<FD,U>::Prover(Proof& proof, const FD& FieldD) :
   s.resize(proof.V, proof.pk->get_params());
   y.resize(proof.V, FieldD);
 #ifdef LESS_ALLOC_MORE_MEM
-  s.allocate_slots(bigint(1) << proof.B_rand_length);
-  y.allocate_slots(bigint(1) << proof.B_plain_length);
   t = s[0];
   z = y[0];
   // extra limb to prevent reallocation
   t.allocate_slots(bigint(1) << (proof.B_rand_length + 64));
   z.allocate_slots(bigint(1) << (proof.B_plain_length + 64));
+  s.allocate_slots(bigint(1) << proof.B_rand_length);
+  y.allocate_slots(bigint(1) << proof.B_plain_length);
 #endif
 }
 
diff --git a/FHEOffline/SimpleMachine.cpp b/FHEOffline/SimpleMachine.cpp
index 22bc8386..98bfd2dc 100644
--- a/FHEOffline/SimpleMachine.cpp
+++ b/FHEOffline/SimpleMachine.cpp
@@ -299,12 +299,6 @@ void MachineBase::run()
             << timer.elapsed() << " seconds" << endl;
     cout << "CPU time: " << cpu_timer.elapsed() << endl;
 
-    extern unsigned long long sent_amount, sent_counter;
-    cout << "Data sent = " << sent_amount << " bytes in " << sent_counter
-            << " calls, ";
-    cout << sent_amount / sent_counter / N.num_players() << " bytes per call"
-            << endl;
-
     cout << "Time: " << timer.elapsed() << endl;
     cout << "Throughput: " << total / timer.elapsed() << endl;
     mult_performance();
diff --git a/GC/MaliciousRepSecret.h b/GC/MaliciousRepSecret.h
index 40d82a4c..da255180 100644
--- a/GC/MaliciousRepSecret.h
+++ b/GC/MaliciousRepSecret.h
@@ -52,46 +52,59 @@ public:
     }
 };
 
-class MaliciousRepSecret : public ReplicatedSecret<MaliciousRepSecret>
+template<class U>
+class MalRepSecretBase : public ReplicatedSecret<U> // U is the concrete share type deriving from this class
 {
-    typedef ReplicatedSecret<MaliciousRepSecret> super;
+    typedef ReplicatedSecret<U> super;
 
 public:
-    typedef Memory<MaliciousRepSecret> DynamicMemory;
+    typedef Memory<U> DynamicMemory;
 
-    typedef MaliciousRepMC<MaliciousRepSecret> MC;
+    typedef MaliciousRepMC<U> MC;
     typedef MC MAC_Check;
 
-    typedef Beaver<MaliciousRepSecret> Protocol;
-    typedef ReplicatedInput<MaliciousRepSecret> Input;
-    typedef RepPrep<MaliciousRepSecret> LivePrep;
+    typedef ReplicatedInput<U> Input;
+    typedef RepPrep<U> LivePrep;
 
-    typedef MaliciousRepSecret part_type;
-    typedef MaliciousRepSecret whole_type;
-
-    typedef SmallMalRepSecret small_type;
+    typedef U part_type;
+    typedef U whole_type;
 
     static const bool expensive_triples = true;
 
-    static MC* new_mc(mac_key_type)
+    static MC* new_mc(BitVec) // the argument is ignored
     {
         try
         {
-            if (ThreadMaster<MaliciousRepSecret>::s().machine.more_comm_less_comp)
-                return new CommMaliciousRepMC<MaliciousRepSecret>;
+            if (ThreadMaster<U>::s().machine.more_comm_less_comp)
+                return new CommMaliciousRepMC<U>;
         }
         catch(no_singleton& e)
         {
         }
-        return new HashMaliciousRepMC<MaliciousRepSecret>;
+        return new HashMaliciousRepMC<U>; // default, also reached when no_singleton is caught
     }
 
-    static MaliciousRepSecret constant(const BitVec& other, int my_num, const BitVec& alphai)
+    static U constant(const BitVec& other, int my_num, const BitVec& alphai)
     {
         (void) my_num, (void) alphai;
         return other;
     }
 
+    MalRepSecretBase() {}
+    template<class T>
+    MalRepSecretBase(const T& other) : super(other) {} // forwards any argument to ReplicatedSecret<U>
+};
+
+class MaliciousRepSecret : public MalRepSecretBase<MaliciousRepSecret>
+{
+    typedef MaliciousRepSecret This;
+    typedef MalRepSecretBase<This> super;
+
+public:
+    typedef Beaver<MaliciousRepSecret> Protocol;
+
+    typedef SmallMalRepSecret small_type;
+
     MaliciousRepSecret() {}
     template<class T>
     MaliciousRepSecret(const T& other) : super(other) {}
diff --git a/GC/PostSacriBin.cpp b/GC/PostSacriBin.cpp
new file mode 100644
index 00000000..81341cf0
--- /dev/null
+++ b/GC/PostSacriBin.cpp
@@ -0,0 +1,120 @@
+/*
+ * PostSacriBin.cpp
+ *
+ */
+
+#include "PostSacriBin.h"
+
+#include "Processor/Processor.h"
+
+#include "Protocols/Replicated.hpp"
+#include "Protocols/MaliciousRepMC.hpp"
+#include "ShareSecret.hpp"
+
+namespace GC
+{
+
+PostSacriBin::PostSacriBin(Player& P) :
+        ReplicatedBase(P), honest(P) // the inner Replicated protocol runs over the same player
+{
+}
+
+PostSacriBin::~PostSacriBin()
+{
+    if (not inputs.empty()) // multiplications were recorded after the last check()
+    {
+        cerr << "unchecked ANDs" << endl;
+        terminate(); // aborts the program instead of dropping the pending checks
+    }
+}
+
+void PostSacriBin::init_mul(SubProcessor<T>* proc)
+{
+    assert(proc != 0);
+    init_mul(proc->DataF, proc->MC); // the two-argument overload ignores both arguments
+}
+
+void PostSacriBin::init_mul(Preprocessing<T>&, T::MC&)
+{
+    if ((int) inputs.size() >= OnlineOptions::singleton.batch_size) // enough unchecked multiplications for a batch
+        check();
+    honest.init_mul();
+}
+
+PostSacriBin::T::clear PostSacriBin::prepare_mul(const T& x, const T& y, int n)
+{
+    honest.prepare_mul(x, y, n);
+    inputs.push_back({{x.mask(n), y.mask(n)}}); // masked operands are kept for check()
+    return {};
+}
+
+void PostSacriBin::exchange()
+{
+    honest.exchange(); // communication is delegated entirely to the inner protocol
+}
+
+PostSacriBin::T PostSacriBin::finalize_mul(int n)
+{
+    auto product = honest.finalize_mul(n);
+    outputs.push_back({product, n}); // kept together with its bit length for check()
+    return product;
+}
+
+void PostSacriBin::check()
+{
+    vector<array<T, 3>> to_check;
+    assert(inputs.size() == outputs.size());
+    for (size_t i = 0; i < inputs.size(); i++) // first the recorded multiplications ...
+        to_check.push_back({{inputs[i][0], inputs[i][1], outputs[i].first}});
+    GlobalPRNG G(P);
+    for (size_t i = 0; i < inputs.size(); i++) // ... then one d1 triple of the same bit length for each of them
+        to_check.push_back(get_d1_triple(G, outputs[i].second));
+    HashMaliciousRepMC<T> MC;
+    vector<array<T, 3>> _(N); // presumably the output argument of triple_sacrifice(); not used afterwards
+    TripleShuffleSacrifice<T>(2, 6).triple_sacrifice(_, to_check, P, MC, 0, inputs.size());
+    MC.Check(P);
+    inputs.clear();
+    outputs.clear();
+}
+
+array<PostSacriBin::T, 3> PostSacriBin::get_d1_triple(GlobalPRNG& G, int n_bits)
+{
+    while (d1.size() < N)
+        d1.push_back(get_d2_triple(T::N_BITS));
+    // take n_bits from a random slot, then shift the slot and mix in a d2 triple
+    int slot = G.get_uint(N);
+    auto taken = d1.at(slot).mask(n_bits);
+    d1[slot] <<= n_bits;
+    d1[slot] ^= get_d2_triple(n_bits);
+    return array<T, 3>({{taken[0], taken[1], taken[2]}});
+}
+
+array<PostSacriBin::T, 3> PostSacriBin::get_d2_triple(int n_bits)
+{
+    return get_triple_no_count(n_bits); // not defined in this class; presumably from ShiftableTripleBuffer - confirm
+}
+
+void PostSacriBin::get(Dtype type, T* res)
+{
+    assert(type == DATA_TRIPLE); // only triples are supported
+
+    if (d2.empty()) // refill d2 with N triples
+    {
+        TripleShuffleSacrifice<T> sacrifice(2, 6);
+        vector<array<T, 3>> check_triples;
+
+        // optimistic triple generation
+        Replicated<T> protocol(P);
+        generate_triples(check_triples, 2 * N + 6, &protocol, T::N_BITS);
+        HashMaliciousRepMC<T> MC;
+        sacrifice.triple_sacrifice(d2, check_triples, P, MC, 0);
+        MC.Check(P);
+        assert(d2.size() == N);
+    }
+
+    for (int i = 0; i < 3; i++) // hand out the last triple in d2
+        res[i] = d2.back()[i];
+    d2.pop_back();
+}
+
+} /* namespace GC */
diff --git a/GC/PostSacriBin.h b/GC/PostSacriBin.h
new file mode 100644
index 00000000..50baa9c5
--- /dev/null
+++ b/GC/PostSacriBin.h
@@ -0,0 +1,52 @@
+/*
+ * PostSacriBin.h
+ *
+ */
+
+#ifndef GC_POSTSACRIBIN_H_
+#define GC_POSTSACRIBIN_H_
+
+#include "PostSacriSecret.h"
+#include "Protocols/Replicated.h"
+#include "ShiftableTripleBuffer.h"
+
+namespace GC
+{
+
+class PostSacriBin : public ReplicatedBase,
+        public ProtocolBase<PostSacriSecret>,
+        ShiftableTripleBuffer<PostSacriSecret>
+{
+    typedef PostSacriSecret T;
+
+    Replicated<T> honest; // inner protocol that computes the products
+
+    vector<array<T, 2>> inputs; // masked operands of multiplications not yet checked
+    vector<pair<T, int>> outputs; // their results, each with its bit length
+
+    // as in Araki et al. (S&P'17)
+    vector<FixedVec<T, 3>> d1;
+    vector<array<T, 3>> d2;
+
+    array<T, 3> get_d1_triple(GlobalPRNG& G, int n_bits);
+    array<T, 3> get_d2_triple(int n_bits);
+    void get(Dtype type, T* res);
+
+    const size_t N = 1 << 20; // size d1 is filled up to and size of d2 after a refill
+
+public:
+    PostSacriBin(Player& P);
+    ~PostSacriBin(); // terminates the program if unchecked multiplications remain
+
+    void init_mul(Preprocessing<T>&, T::MC&);
+    void init_mul(SubProcessor<T>* proc);
+    T::clear prepare_mul(const T& x, const T& y, int n = -1);
+    void exchange();
+    T finalize_mul(int n = -1);
+
+    void check(); // verifies all multiplications recorded since the previous call
+};
+
+} /* namespace GC */
+
+#endif /* GC_POSTSACRIBIN_H_ */
diff --git a/GC/PostSacriSecret.h b/GC/PostSacriSecret.h
new file mode 100644
index 00000000..f285b208
--- /dev/null
+++ b/GC/PostSacriSecret.h
@@ -0,0 +1,37 @@
+/*
+ * PostSacriSecret.h
+ *
+ */
+
+#ifndef GC_POSTSACRISECRET_H_
+#define GC_POSTSACRISECRET_H_
+
+#include "MaliciousRepSecret.h"
+
+namespace GC
+{
+
+class PostSacriBin;
+
+class PostSacriSecret : public MalRepSecretBase<PostSacriSecret>
+{
+    typedef PostSacriSecret This;
+    typedef MalRepSecretBase<This> super;
+
+public:
+    typedef PostSacriBin Protocol; // only forward-declared in this header
+
+    PostSacriSecret()
+    {
+    }
+
+    template<class T>
+    PostSacriSecret(const T& other) :
+            super(other) // any argument is forwarded to the base class
+    {
+    }
+};
+
+}
+
+#endif
diff --git a/GC/Program.hpp b/GC/Program.hpp
index 9815c0b8..f1547f59 100644
--- a/GC/Program.hpp
+++ b/GC/Program.hpp
@@ -14,6 +14,8 @@
 
 #include "Tools/callgrind.h"
 
+#include "Processor/Instruction.hpp"
+
 namespace GC
 {
 
diff --git a/GC/ShareParty.h b/GC/ShareParty.h
index 9e7e3a56..389efa33 100644
--- a/GC/ShareParty.h
+++ b/GC/ShareParty.h
@@ -22,7 +22,7 @@ class ShareParty : public ThreadMaster<T>
 {
     static ShareParty<T>* singleton;
 
-    ez::ezOptionParser opt;
+    ez::ezOptionParser& opt;
     OnlineOptions online_opts;
 
 public:
@@ -30,7 +30,8 @@ public:
 
     typename T::mac_key_type mac_key;
 
-    ShareParty(int argc, const char** argv, int default_batch_size = 0);
+    ShareParty(int argc, const char** argv, ez::ezOptionParser& opt,
+            int default_batch_size = 0);
 
     Thread<T>* new_thread(int i);
 
diff --git a/GC/ShareParty.hpp b/GC/ShareParty.hpp
index 926a74f3..c38bac03 100644
--- a/GC/ShareParty.hpp
+++ b/GC/ShareParty.hpp
@@ -17,6 +17,10 @@
 
 #include "ShareThread.hpp"
 #include "RepPrep.hpp"
+#include "ThreadMaster.hpp"
+#include "Thread.hpp"
+#include "ShareSecret.hpp"
+
 #include "Protocols/Replicated.hpp"
 #include "Protocols/ReplicatedPrep.hpp"
 #include "Protocols/MaliciousRepMC.hpp"
@@ -29,16 +33,31 @@ template<class T>
 ShareParty<T>* ShareParty<T>::singleton = 0;
 
 template<class T>
-ShareParty<T>::ShareParty(int argc, const char** argv, int default_batch_size) :
-        ThreadMaster<T>(online_opts), online_opts(opt, argc, argv,
+void simple_binary_main(int argc, const char** argv, int default_batch_size = 0)
+{
+    ez::ezOptionParser opt; // ShareParty only keeps a reference, so the parser lives here
+    ShareParty<T>(argc, argv, opt, default_batch_size); // temporary object; presumably the constructor does all the work - confirm
+}
+
+template<class T>
+ShareParty<T>::ShareParty(int argc, const char** argv, ez::ezOptionParser& opt,
+        int default_batch_size) :
+        ThreadMaster<T>(online_opts), opt(opt),
+        online_opts(this->opt, argc, argv,
                 default_batch_size)
 {
     if (singleton)
         throw runtime_error("there can only be one");
     singleton = this;
 
+    int nplayers = 0;
+    opt.parse(argc, argv);
+    if (opt.get("-N"))
+        opt.get("-N")->getInt(nplayers);
+    opt.resetArgs();
     NetworkOptionsWithNumber network_opts(opt, argc, argv,
-            T::dishonest_majority ? 2 : 3, T::variable_players);
+            nplayers > 0 ? nplayers : (T::dishonest_majority ? 2 : 3),
+            T::variable_players and nplayers == 0);
     if (T::dishonest_majority)
         opt.add(
                 "", // Default.
diff --git a/GC/square64.cpp b/GC/square64.cpp
index 5217a090..512b7baa 100644
--- a/GC/square64.cpp
+++ b/GC/square64.cpp
@@ -25,7 +25,7 @@ union matrix32x8
 
     void transpose(square64& output, int x, int y)
     {
-#ifdef __AVX2__
+#if defined(__AVX2__) || !defined(__x86_64__)
         if (cpu_has_avx2())
         {
             for (int j = 0; j < 8; j++)
@@ -66,7 +66,7 @@ case I: \
 void zip(int chunk_size, __m256i& lows, __m256i& highs,
         const __m256i& a, const __m256i& b)
 {
-#ifdef __AVX2__
+#if defined(__AVX2__) || !defined(__x86_64__)
     if (cpu_has_avx2())
     {
         switch (chunk_size)
diff --git a/GC/square64.h b/GC/square64.h
index 03bf553a..84badde9 100644
--- a/GC/square64.h
+++ b/GC/square64.h
@@ -6,10 +6,10 @@
 #ifndef GC_SQUARE64_H_
 #define GC_SQUARE64_H_
 
-#include <immintrin.h>
 #include <string.h>
 #include <cstdint>
 #include "Tools/int.h"
+#include "Tools/intrinsics.h"
 
 union square64
 {
diff --git a/Machines/ccd-party.cpp b/Machines/ccd-party.cpp
index 4a7c1045..765699c0 100644
--- a/Machines/ccd-party.cpp
+++ b/Machines/ccd-party.cpp
@@ -17,5 +17,7 @@
 int main(int argc, const char** argv)
 {
     gf2n_short::init_field(40);
-    GC::ShareParty<GC::CcdSecret<gf2n_short>>(argc, argv);
+    ez::ezOptionParser opt; // one parser shared by the Shamir options and the party
+    ShamirOptions::singleton = {opt, argc, argv};
+    GC::ShareParty<GC::CcdSecret<gf2n_short>>(argc, argv, opt);
 }
diff --git a/Machines/malicious-ccd-party.cpp b/Machines/malicious-ccd-party.cpp
index c083a4c2..a4354206 100644
--- a/Machines/malicious-ccd-party.cpp
+++ b/Machines/malicious-ccd-party.cpp
@@ -18,5 +18,7 @@
 int main(int argc, const char** argv)
 {
     gf2n_short::init_field(40);
-    GC::ShareParty<GC::MaliciousCcdSecret<gf2n_short>>(argc, argv);
+    ez::ezOptionParser opt; // one parser shared by the Shamir options and the party
+    ShamirOptions::singleton = {opt, argc, argv};
+    GC::ShareParty<GC::MaliciousCcdSecret<gf2n_short>>(argc, argv, opt);
 }
diff --git a/Machines/malicious-rep-bin-party.cpp b/Machines/malicious-rep-bin-party.cpp
index eb641e6d..2ae79671 100644
--- a/Machines/malicious-rep-bin-party.cpp
+++ b/Machines/malicious-rep-bin-party.cpp
@@ -21,5 +21,5 @@
 
 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::MaliciousRepSecret>(argc, argv);
+    GC::simple_binary_main<GC::MaliciousRepSecret>(argc, argv);
 }
diff --git a/Machines/ps-rep-bin-party.cpp b/Machines/ps-rep-bin-party.cpp
new file mode 100644
index 00000000..98ffb298
--- /dev/null
+++ b/Machines/ps-rep-bin-party.cpp
@@ -0,0 +1,14 @@
+/*
+ * ps-rep-bin-party.cpp
+ *
+ */
+
+#include "GC/PostSacriBin.h"
+#include "GC/PostSacriSecret.h"
+
+#include "GC/ShareParty.hpp"
+
+int main(int argc, const char** argv)
+{
+    GC::simple_binary_main<GC::PostSacriSecret>(argc, argv);
+}
diff --git a/Machines/replicated-bin-party.cpp b/Machines/replicated-bin-party.cpp
index 7a755170..763b1918 100644
--- a/Machines/replicated-bin-party.cpp
+++ b/Machines/replicated-bin-party.cpp
@@ -20,5 +20,5 @@
 
 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::SemiHonestRepSecret>(argc, argv);
+    GC::simple_binary_main<GC::SemiHonestRepSecret>(argc, argv);
 }
diff --git a/Machines/semi-bin-party.cpp b/Machines/semi-bin-party.cpp
index ee5d92d7..fbd0a634 100644
--- a/Machines/semi-bin-party.cpp
+++ b/Machines/semi-bin-party.cpp
@@ -23,5 +23,5 @@
 
 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::SemiSecret>(argc, argv);
+    GC::simple_binary_main<GC::SemiSecret>(argc, argv);
 }
diff --git a/Machines/tinier-party.cpp b/Machines/tinier-party.cpp
index f4c0fbe9..1a9f5fe5 100644
--- a/Machines/tinier-party.cpp
+++ b/Machines/tinier-party.cpp
@@ -27,5 +27,5 @@
 int main(int argc, const char** argv)
 {
     gf2n_short::init_field(40);
-    GC::ShareParty<GC::TinierSecret<gf2n_short>>(argc, argv, 1000);
+    GC::simple_binary_main<GC::TinierSecret<gf2n_short>>(argc, argv, 1000);
 }
diff --git a/Machines/tiny-party.cpp b/Machines/tiny-party.cpp
index f96f5490..e24df7a1 100644
--- a/Machines/tiny-party.cpp
+++ b/Machines/tiny-party.cpp
@@ -26,5 +26,5 @@
 
 int main(int argc, const char** argv)
 {
-    GC::ShareParty<GC::TinySecret<40>>(argc, argv, 1000);
+    GC::simple_binary_main<GC::TinySecret<40>>(argc, argv, 1000);
 }
diff --git a/Makefile b/Makefile
index d691ace0..6406b28e 100644
--- a/Makefile
+++ b/Makefile
@@ -26,7 +26,12 @@ VM = $(PROCESSOR) $(COMMON) GC/square64.o GC/Instruction.o OT/OTTripleSetup.o OT
 
 LIB = libSPDZ.a
 LIBRELEASE = librelease.a
+
+ifeq ($(AVX_OT), 0)
+LIBSIMPLEOT = ECDSA/P256Element.o
+else
 LIBSIMPLEOT = SimpleOT/libsimpleot.a
+endif
 
 # used for dependency generation
 OBJS = $(BMR) $(FHEOFFLINE) $(TINYOTOFFLINE) $(YAO) $(COMPLETE) $(patsubst %.cpp,%.o,$(wildcard Machines/*.cpp Utils/*.cpp))
@@ -47,7 +52,6 @@ binary: rep-bin yao semi-bin-party.x tinier-party.x tiny-party.x ccd-party.x mal
 
 ifeq ($(USE_NTL),1)
 all: overdrive she-offline
-gear: cowgear-party.x chaigear-party.x lowgear-party.x highgear-party.x
 arithmetic: hemi-party.x soho-party.x gear
 endif
 
@@ -73,13 +77,14 @@ yao: yao-party.x
 
 she-offline: Check-Offline.x spdz2-offline.x
 
-overdrive: simple-offline.x pairwise-offline.x cnc-offline.x
+overdrive: simple-offline.x pairwise-offline.x cnc-offline.x gear
+gear: cowgear-party.x chaigear-party.x lowgear-party.x highgear-party.x
 
 rep-field: malicious-rep-field-party.x replicated-field-party.x ps-rep-field-party.x
 
 rep-ring: replicated-ring-party.x brain-party.x malicious-rep-ring-party.x ps-rep-ring-party.x rep4-ring-party.x
 
-rep-bin: replicated-bin-party.x malicious-rep-bin-party.x Fake-Offline.x
+rep-bin: replicated-bin-party.x malicious-rep-bin-party.x ps-rep-bin-party.x Fake-Offline.x
 
 replicated: rep-field rep-ring rep-bin
 
@@ -96,6 +101,10 @@ else
 tldr: mpir
 endif
 
+ifeq ($(MACHINE), aarch64)
+tldr: simde/simde
+endif
+
 shamir: shamir-party.x malicious-shamir-party.x galois-degree.x
 
 sy: sy-rep-field-party.x sy-rep-ring-party.x sy-shamir-party.x
@@ -107,10 +116,10 @@ $(LIBRELEASE): Protocols/MalRepRingOptions.o $(PROCESSOR) $(COMMON) $(OT) $(GC)
 	$(AR) -csr $@ $^
 
 static/%.x: Machines/%.o $(LIBRELEASE) $(LIBSIMPLEOT)
-	$(CXX) $(CFLAGS) -o $@ $^ $(LIBRELEASE) $(LIBSIMPLEOT) -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++ $(BOOST) $(LDLIBS) -Wl,-Bdynamic -ldl
+	$(CXX) $(CFLAGS) -o $@ $^ -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++  $(LIBRELEASE) $(LIBSIMPLEOT) $(BOOST) $(LDLIBS) -Wl,-Bdynamic -ldl
 
 static/%.x: ECDSA/%.o ECDSA/P256Element.o $(VM) $(OT) $(LIBSIMPLEOT)
-	$(CXX) $(CFLAGS) -o $@ $^ -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++ $(BOOST) $(LDLIBS) $(ECLIB) -Wl,-Bdynamic -ldl
+	$(CXX) $(CFLAGS) -o $@ $^ -Wl,-Map=$<.map -Wl,-Bstatic -static-libgcc -static-libstdc++ $(BOOST) $(LDLIBS) -Wl,-Bdynamic -ldl
 
 static-dir:
 	@ mkdir static 2> /dev/null; true
@@ -118,7 +127,7 @@ static-dir:
 static-release: static-dir $(patsubst Machines/%.cpp, static/%.x, $(wildcard Machines/*-party.cpp)) static/emulate.x
 
 Fake-ECDSA.x: ECDSA/Fake-ECDSA.cpp ECDSA/P256Element.o $(COMMON) Processor/PrepBase.o
-	$(CXX) -o $@ $^ $(CFLAGS) $(LDLIBS) $(ECLIB)
+	$(CXX) -o $@ $^ $(CFLAGS) $(LDLIBS)
 
 Check-Offline.x: $(PROCESSOR)
 
@@ -167,14 +176,24 @@ secure.x: Utils/secure.o
 %.x: Machines/%.o $(VM) OT/OTTripleSetup.o OT/BaseOT.o $(LIBSIMPLEOT)
 	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS)
 
+%gear-party.x: Machines/%gear-party.o $(VM) OT/OTTripleSetup.o OT/BaseOT.o $(LIBSIMPLEOT)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) -lntl
+
+hemi-party.x: Machines/hemi-party.o $(VM)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) -lntl
+
+soho-party.x: Machines/soho-party.o $(VM)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) -lntl
+
 %-ecdsa-party.x: ECDSA/%-ecdsa-party.o ECDSA/P256Element.o $(VM)
-	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS) $(ECLIB)
+	$(CXX) -o $@ $(CFLAGS) $^ $(LDLIBS)
 
 replicated-bin-party.x: GC/square64.o
 replicated-ring-party.x: GC/square64.o
 replicated-field-party.x: GC/square64.o
 brain-party.x: GC/square64.o
 malicious-rep-bin-party.x: GC/square64.o
+ps-rep-bin-party.x: GC/PostSacriBin.o
 semi-bin-party.x: $(VM) $(OT) GC/SemiSecret.o GC/SemiPrep.o GC/square64.o
 tiny-party.x: $(OT)
 tinier-party.x: $(OT)
@@ -220,6 +239,7 @@ static/semi-bmr-party.x: $(BMR)
 static/real-bmr-party.x: $(BMR)
 static/bmr-program-party.x: $(BMR)
 
+ifneq ($(AVX_OT), 0)
 $(LIBSIMPLEOT): SimpleOT/Makefile
 	$(MAKE) -C SimpleOT
 
@@ -227,6 +247,7 @@ OT/BaseOT.o: SimpleOT/Makefile
 
 SimpleOT/Makefile:
 	git submodule update --init SimpleOT
+endif
 
 .PHONY: Programs/Circuits
 Programs/Circuits:
@@ -259,5 +280,8 @@ mac-setup:
 	-echo MY_LDLIBS += -L/usr/local/opt/openssl/lib >> CONFIG.mine
 	-echo USE_NTL = 1 >> CONFIG.mine
 
+simde/simde:
+	git submodule update --init simde
+
 clean:
 	-rm -f */*.o *.o */*.d *.d *.x core.* *.a gmon.out */*/*.o static/*.x
diff --git a/Math/FixedVec.h b/Math/FixedVec.h
index df51fa21..e579b3f6 100644
--- a/Math/FixedVec.h
+++ b/Math/FixedVec.h
@@ -277,6 +277,12 @@ public:
         return res;
     }
 
+    FixedVec<T, L>& operator<<=(int i)
+    {
+        *this = *this << i;
+        return *this;
+    }
+
     FixedVec<T, L>& operator>>=(int i)
     {
         *this = *this >> i;
diff --git a/Math/Integer.h b/Math/Integer.h
index 057a952b..4f750690 100644
--- a/Math/Integer.h
+++ b/Math/Integer.h
@@ -143,19 +143,6 @@ class Integer : public IntBase<long>
   friend unsigned int& operator+=(unsigned int& x, const Integer& other) { return x += other.a; }
 
   long operator-() const { return -a; }
-
-  void add(const Integer& x, const Integer& y) { *this = x + y; }
-  void sub(const Integer& x, const Integer& y) { *this = x - y; }
-  void mul(const Integer& x, const Integer& y) { *this = x * y; }
-
-  void mul(const Integer& x) { *this = *this * x; }
-
-  void AND(const Integer& x, const Integer& y) { *this = x & y; }
-  void OR(const Integer& x, const Integer& y) { *this = x | y; }
-  void XOR(const Integer& x, const Integer& y) { *this = x ^ y; }
-  void SHL(const Integer& x, const Integer& y) { *this = x << y; }
-  // unsigned shift for Mod2m
-  void SHR(const Integer& x, const Integer& y) { *this = (unsigned long)x.a >> y.a; }
 };
 
 inline string to_string(const Integer& x)
diff --git a/Math/Zp_Data.cpp b/Math/Zp_Data.cpp
index 884d7ace..bb95dc05 100644
--- a/Math/Zp_Data.cpp
+++ b/Math/Zp_Data.cpp
@@ -64,7 +64,9 @@ void Zp_Data::init(const bigint& p,bool mont)
 
 void Zp_Data::Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y,int t) const
 {
-  mp_limb_t ans[2*MAX_MOD_SZ+1],u;
+  mp_limb_t ans[2 * MAX_MOD_SZ + 1], u, yy[t + 1];
+  inline_mpn_copyi(yy, y, t);
+  yy[t] = 0;
   // First loop
   u=x[0]*y[0]*pi;
   ans[t]  = mpn_mul_1(ans,y,t,x[0]);
@@ -73,8 +75,8 @@ void Zp_Data::Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y,int t
     { // u=(ans0+xi*y0)*pd
       u=(ans[i]+x[i]*y[0])*pi;
       // ans=ans+xi*y+u*pr
-      ans[t+i]+=mpn_addmul_1(ans+i,y,t,x[i]);
-      ans[t+i+1]=mpn_addmul_1(ans+i,prA,t+1,u);
+      ans[t+i+1]=mpn_addmul_1(ans+i,yy,t+1,x[i]);
+      ans[t+i+1]+=mpn_addmul_1(ans+i,prA,t+1,u);
     }
   // if (ans>=pr) { ans=z-pr; }
   // else         { z=ans;    }
diff --git a/Math/Zp_Data.h b/Math/Zp_Data.h
index e067b85f..6dca5f02 100644
--- a/Math/Zp_Data.h
+++ b/Math/Zp_Data.h
@@ -13,8 +13,8 @@
 #include "Math/bigint.h"
 #include "Math/mpn_fixed.h"
 #include "Tools/random.h"
+#include "Tools/intrinsics.h"
 
-#include <smmintrin.h>
 #include <iostream>
 using namespace std;
 
@@ -43,6 +43,8 @@ class Zp_Data
   void Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y, int t) const;
   void Mont_Mult_variable(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t* y) const
   { Mont_Mult(z, x, y, t); }
+  void Mont_Mult_max(mp_limb_t* z, const mp_limb_t* x, const mp_limb_t* y,
+      int max_t) const;
 
   public:
 
@@ -125,7 +127,7 @@ inline void Zp_Data::Add<0>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y
 template<>
 inline void Zp_Data::Add<1>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y) const
 {
-#ifdef __clang__
+#if defined(__clang__) || !defined(__x86_64__)
   Add<0>(ans, x, y);
 #else
   *ans = *x + *y;
@@ -139,7 +141,7 @@ inline void Zp_Data::Add<1>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y
 template<>
 inline void Zp_Data::Add<2>(mp_limb_t* ans,const mp_limb_t* x,const mp_limb_t* y) const
 {
-#ifdef __clang__
+#if defined(__clang__) || !defined(__x86_64__)
   Add<0>(ans, x, y);
 #else
   __uint128_t a, b, p;
@@ -229,7 +231,7 @@ inline void Zp_Data::Mont_Mult_(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t*
     { // u=(ans0+xi*y0)*pd
       u=(ans[i]+x[i]*y[0])*pi;
       // ans=ans+xi*y+u*pr
-      mpn_addmul_1_fixed_<T + 1, T>(ans+i,y,x[i]);
+      mpn_addmul_1_fixed_<T + 2, T>(ans+i,y,x[i]);
       mpn_addmul_1_fixed_<T + 2, T + 1>(ans+i,prA,u);
     }
   // if (ans>=pr) { ans=z-pr; }
@@ -276,4 +278,11 @@ inline void Zp_Data::Mont_Mult(mp_limb_t* z,const mp_limb_t* x,const mp_limb_t*
   }
 }
 
+inline void Zp_Data::Mont_Mult_max(mp_limb_t* z, const mp_limb_t* x,
+    const mp_limb_t* y, int max_t) const
+{
+  assert(t <= max_t);
+  Mont_Mult(z, x, y);
+}
+
 #endif
diff --git a/Math/bigint.cpp b/Math/bigint.cpp
index 0552bf05..1107c969 100644
--- a/Math/bigint.cpp
+++ b/Math/bigint.cpp
@@ -153,6 +153,11 @@ bigint::bigint(const gfpvar& other)
   to_bigint(*this, other.get(), other.get_ZpD());
 }
 
+bigint::bigint(const mp_limb_t* data, size_t n_limbs) // import raw limb array
+{
+  mpz_import(get_mpz_t(), n_limbs, -1, sizeof(mp_limb_t), 0, 0, data); // least-significant limb first, native byte order
+}
+
 string to_string(const bigint& x)
 {
   stringstream ss;
diff --git a/Math/bigint.h b/Math/bigint.h
index 697aa31f..081a49c4 100644
--- a/Math/bigint.h
+++ b/Math/bigint.h
@@ -63,6 +63,7 @@ public:
   bigint(const fixint<L>& x) : bigint(typename fixint<L>::super(x)) {}
   bigint(const Integer& x);
   bigint(const GC::Clear& x);
+  bigint(const mp_limb_t* data, size_t n_limbs);
 
   bigint& operator=(int n);
   bigint& operator=(long n);
@@ -75,6 +76,11 @@ public:
   template<int K>
   bigint& operator=(const SignedZ2<K>& x);
 
+  template<int X, int L>
+  bigint& from_signed(const gfp_<X, L>& other);
+  template<class T>
+  bigint& from_signed(const T& other);
+
   void allocate_slots(const bigint& x) { *this = x; }
   int get_min_alloc() { return get_mpz_t()->_mp_alloc; }
 
diff --git a/Math/bigint.hpp b/Math/bigint.hpp
index a5b19523..9662d0bf 100644
--- a/Math/bigint.hpp
+++ b/Math/bigint.hpp
@@ -9,12 +9,27 @@
 #include "bigint.h"
 #include "Integer.h"
 
+template<int X, int L>
+bigint& bigint::from_signed(const gfp_<X, L>& other)
+{
+    to_signed_bigint(*this, other);
+    return *this;
+}
+
+template<class T>
+bigint& bigint::from_signed(const T& other)
+{
+    *this = other;
+    return *this;
+}
+
 template<class T>
 mpf_class bigint::get_float(T v, T p, T z, T s)
 {
     // MPIR can't handle more precision in exponent
     Integer exp = Integer(p, 31).get();
-    bigint tmp = v;
+    bigint tmp;
+    tmp.from_signed(v);
     mpf_class res = tmp;
     if (exp > 0)
         mpf_mul_2exp(res.get_mpf_t(), res.get_mpf_t(), exp.get());
diff --git a/Math/fixint.h b/Math/fixint.h
index adcda3dc..c10d3c26 100644
--- a/Math/fixint.h
+++ b/Math/fixint.h
@@ -11,7 +11,7 @@
 template<int L>
 class fixint : public SignedZ2<64 * (L + 1)>
 {
-    static const int OVERFLOW = 60;
+    static const int N_OVERFLOW = 60;
 
 public:
     typedef SignedZ2<64 * (L + 1)> super;
@@ -24,7 +24,7 @@ public:
     fixint(const T& other) :
             super(other)
     {
-        auto check = mp_limb_signed_t(this->a[this->N_WORDS - 1]) >> OVERFLOW;
+        auto check = mp_limb_signed_t(this->a[this->N_WORDS - 1]) >> N_OVERFLOW;
         assert(check == 0 or check == -1);
     }
 
@@ -70,10 +70,10 @@ public:
     void allocate_slots(const T& limit)
     {
         int n_bits = this->size_in_bits();
-        if (numBits(limit) - OVERFLOW > n_bits)
+        if (numBits(limit) - N_OVERFLOW > n_bits)
         {
-        cerr << "cannot hold " << numBits(limit) << " bits, " << n_bits
-                << " available" << endl;
+            cerr << "maybe change N_LIMBS_RAND to at least "
+                    << ((numBits(limit) - N_OVERFLOW) / 64) << endl;
             throw runtime_error("fixed-length integer too small");
         }
     }
diff --git a/Math/gf2n.cpp b/Math/gf2n.cpp
index f10ee96b..987b8a4a 100644
--- a/Math/gf2n.cpp
+++ b/Math/gf2n.cpp
@@ -2,12 +2,10 @@
 #include "Math/gf2n.h"
 #include "Math/Bit.h"
 
+#include "Tools/intrinsics.h"
 #include "Tools/Exceptions.h"
 
 #include <stdint.h>
-#include <wmmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
 
 const false_type ValueInterface::characteristic_two;
 const false_type ValueInterface::prime_field;
@@ -16,6 +14,9 @@ const false_type ValueInterface::invertible;
 const true_type gf2n_short::characteristic_two;
 const true_type gf2n_long::characteristic_two;
 
+const true_type gf2n_short::invertible;
+const true_type gf2n_long::invertible;
+
 int gf2n_short::n = 0;
 int gf2n_short::t1;
 int gf2n_short::t2;
diff --git a/Math/gf2nlong.cpp b/Math/gf2nlong.cpp
index f0d32baf..44369d9d 100644
--- a/Math/gf2nlong.cpp
+++ b/Math/gf2nlong.cpp
@@ -6,12 +6,10 @@
 #include "gf2nlong.h"
 #include "gf2n.h"
 
+#include "Tools/intrinsics.h"
 #include "Tools/Exceptions.h"
 
 #include <stdint.h>
-#include <wmmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
 
 
 bool is_ge(__m128i a, __m128i b)
diff --git a/Math/gf2nlong.h b/Math/gf2nlong.h
index 03a0383f..77467f69 100644
--- a/Math/gf2nlong.h
+++ b/Math/gf2nlong.h
@@ -12,9 +12,8 @@
 #include <iostream>
 using namespace std;
 
-#include <smmintrin.h>
-
 #include "Tools/random.h"
+#include "Tools/intrinsics.h"
 #include "Math/field_types.h"
 #include "Math/bigint.h"
 
@@ -283,7 +282,7 @@ inline __m128i software_clmul(__m128i a, __m128i b, int choice)
 template<int choice>
 inline __m128i clmul(__m128i a, __m128i b)
 {
-#ifdef __PCLMUL__
+#if defined(__PCLMUL__) || !defined(__x86_64__)
     if (cpu_has_pclmul())
     {
         return _mm_clmulepi64_si128(a, b, choice);
diff --git a/Math/gfpvar.cpp b/Math/gfpvar.cpp
index d4f8ac48..7b7360fd 100644
--- a/Math/gfpvar.cpp
+++ b/Math/gfpvar.cpp
@@ -9,6 +9,9 @@
 
 #include "gfp.hpp"
 
+const true_type gfpvar::invertible;
+const true_type gfpvar::prime_field;
+
 Zp_Data gfpvar::ZpD;
 
 string gfpvar::type_string()
diff --git a/Math/modp.h b/Math/modp.h
index fef5956c..6cc08fa5 100644
--- a/Math/modp.h
+++ b/Math/modp.h
@@ -151,7 +151,7 @@ template<int L>
 inline void Mul(modp_<L>& ans,const modp_<L>& x,const modp_<L>& y,const Zp_Data& ZpD)
 {
   if (ZpD.montgomery)
-    { ZpD.Mont_Mult(ans.x,x.x,y.x); }
+    { ZpD.Mont_Mult_max(ans.x,x.x,y.x,L); }
   else
     { //ans.x=(x.x*y.x)%ZpD.pr;
       mp_limb_t aa[2*L],q[2*L];
diff --git a/Math/modp.hpp b/Math/modp.hpp
index 2ebc1e71..0e9d98b3 100644
--- a/Math/modp.hpp
+++ b/Math/modp.hpp
@@ -253,7 +253,7 @@ void Inv(modp_<L>& ans,const modp_<L>& x,const Zp_Data& ZpD)
   else
     { for (int i=sz; i<ZpD.t; i++) { ans.x[i]=0; } }
   if (ZpD.montgomery)
-    { ZpD.Mont_Mult(ans.x,ans.x,ZpD.R3); }
+    { ZpD.Mont_Mult_max(ans.x,ans.x,ZpD.R3,L); }
 }
 
 
diff --git a/Math/mpn_fixed.h b/Math/mpn_fixed.h
index 99cc76b5..cd891869 100644
--- a/Math/mpn_fixed.h
+++ b/Math/mpn_fixed.h
@@ -9,10 +9,10 @@
 #include <mpir.h>
 #include <string.h>
 #include <assert.h>
-#include <x86intrin.h>
 
 #include "Tools/avx_memcpy.h"
 #include "Tools/cpu_support.h"
+#include "Tools/intrinsics.h"
 
 inline void inline_mpn_zero(mp_limb_t* x, mp_size_t size)
 {
@@ -50,6 +50,7 @@ inline void mpn_add_fixed_n<1>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
     *res = *x + *y;
 }
 
+#ifdef __x86_64__
 template <>
 inline void mpn_add_fixed_n<2>(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y)
 {
@@ -91,6 +92,7 @@ inline void mpn_add_fixed_n<4>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
             : "cc"
     );
 }
+#endif
 
 #ifdef __clang__
 inline char clang_add_carry(char carryin, unsigned long x, unsigned long y, unsigned long& res)
@@ -133,16 +135,15 @@ mp_limb_t mpn_add_fixed_n_with_carry(mp_limb_t* res, const mp_limb_t* x, const m
 
 inline mp_limb_t mpn_sub_n_borrow(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y, int n)
 {
-#ifndef __clang__
-#if __GNUC__ < 7
+#if (!defined(__clang__) && (__GNUC__ < 7)) || !defined(__x86_64__)
     // GCC 6 can't handle the code below
     return mpn_sub_n(res, x, y, n);
-#endif
-#endif
+#else
     char borrow = 0;
     for (int i = 0; i < n; i++)
         borrow = _subborrow_u64(borrow, x[i], y[i], (unsigned long long*)&res[i]);
     return borrow;
+#endif
 }
 
 template <int N>
@@ -163,6 +164,7 @@ inline void mpn_sub_fixed_n<1>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
     *res = *x - *y;
 }
 
+#ifdef __x86_64__
 template <>
 inline mp_limb_t mpn_sub_fixed_n_borrow<1>(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y)
 {
@@ -235,6 +237,7 @@ inline void mpn_sub_fixed_n<4>(mp_limb_t* res, const mp_limb_t* x, const mp_limb
             : "cc"
     );
 }
+#endif
 
 inline void mpn_add_n_use_fixed(mp_limb_t* res, const mp_limb_t* x, const mp_limb_t* y, mp_size_t n)
 {
@@ -260,8 +263,8 @@ template <int L, int M, bool ADD>
 inline void mpn_addmul_1_fixed__(mp_limb_t* res, const mp_limb_t* y, mp_limb_t x)
 {
     mp_limb_t lower[L], higher[L];
-    lower[L - 1] = 0;
-    higher[L - 1] = 0;
+    inline_mpn_zero(higher + M, L - M);
+    inline_mpn_zero(lower + M, L - M);
     for (int j = 0; j < M; j++)
         lower[j] = _mulx_u64(x, y[j], (long long unsigned*)higher + j);
     if (ADD)
diff --git a/Networking/Server.cpp b/Networking/Server.cpp
index 25476114..e70cdcf5 100644
--- a/Networking/Server.cpp
+++ b/Networking/Server.cpp
@@ -158,7 +158,7 @@ void* Server::start_in_thread(void* server)
 }
 
 Server* Server::start_networking(Names& N, int my_num, int nplayers,
-        string hostname, int portnum)
+        string hostname, int portnum, int my_port)
 {
 #ifdef DEBUG_NETWORKING
   cerr << "Starting networking for " << my_num << "/" << nplayers
@@ -173,7 +173,7 @@ Server* Server::start_networking(Names& N, int my_num, int nplayers,
       pthread_create(&thread, 0, Server::start_in_thread,
           server = new Server(nplayers, portnum));
     }
-  N.init(my_num, portnum, Names::DEFAULT_PORT, hostname.c_str());
+  N.init(my_num, portnum, my_port, hostname.c_str());
   if (my_num == 0)
     {
       pthread_join(thread, 0);
diff --git a/Networking/Server.h b/Networking/Server.h
index 774b82c1..a5e833ad 100644
--- a/Networking/Server.h
+++ b/Networking/Server.h
@@ -26,7 +26,8 @@ class Server
 public:
     static void* start_in_thread(void* server);
     static Server* start_networking(Names& N, int my_num, int nplayers,
-            string hostname = "localhost", int portnum = 9000);
+            string hostname = "localhost", int portnum = 9000, int my_port =
+                    Names::DEFAULT_PORT);
 
     Server(int argc, char** argv);
     Server(int nmachines, int PortnumBase);
diff --git a/Networking/ServerSocket.cpp b/Networking/ServerSocket.cpp
index 781ace0b..0c7346ca 100644
--- a/Networking/ServerSocket.cpp
+++ b/Networking/ServerSocket.cpp
@@ -6,6 +6,7 @@
 #include <Networking/ServerSocket.h>
 #include <Networking/sockets.h>
 #include "Tools/Exceptions.h"
+#include "Tools/time-func.h"
 
 #include <netinet/ip.h>
 #include <netinet/tcp.h>
@@ -46,10 +47,10 @@ ServerSocket::ServerSocket(int Portnum) : portnum(Portnum), thread(0)
   gethostname((char*)my_name,512);
 
   /* bind serv information to mysocket
-   *   - Just assume it will eventually wake up
    */
   fl=1;
-  while (fl!=0)
+  RunningTimer timer;
+  while (fl!=0 and timer.elapsed() < 600)
     { fl=::bind(main_socket, (struct sockaddr *)&serv, sizeof(struct sockaddr));
       if (fl != 0)
         { cerr << "Binding to socket on " << my_name << ":" << Portnum << " failed, trying again in a second ..." << endl;
@@ -136,6 +137,9 @@ void ServerSocket::accept_clients()
       struct sockaddr dest;
       memset(&dest, 0, sizeof(dest));    /* zero the struct before filling the fields */
       int socksize = sizeof(dest);
+#ifdef DEBUG_NETWORKING
+      fprintf(stderr, "Accepting...\n");
+#endif
       int consocket = accept(main_socket, (struct sockaddr *)&dest, (socklen_t*) &socksize);
       if (consocket<0) { error("set_up_socket:accept"); }
 
diff --git a/Networking/data.h b/Networking/data.h
index 60b8b032..6d7fb728 100644
--- a/Networking/data.h
+++ b/Networking/data.h
@@ -18,6 +18,18 @@
 #endif
 
 
+inline void short_memcpy(void* out, void* in, size_t n_bytes)
+{
+    switch (n_bytes)
+    {
+#define X(N) case N: avx_memcpy<N>(out, in); break;
+    X(1) X(2) X(3) X(4) X(5) X(6) X(7) X(8)
+#undef X
+    default:
+        throw invalid_length("length outside range");
+    }
+}
+
 inline void encode_length(octet *buff, size_t len, size_t n_bytes)
 {
     if (n_bytes > 8)
@@ -31,7 +43,7 @@ inline void encode_length(octet *buff, size_t len, size_t n_bytes)
     }
     // use little-endian for optimization
     uint64_t tmp = htole64(len);
-    avx_memcpy(buff, (void*)&tmp, n_bytes);
+    short_memcpy(buff, (void*)&tmp, n_bytes);
 }
 
 inline size_t decode_length(octet *buff, size_t n_bytes)
@@ -39,7 +51,7 @@ inline size_t decode_length(octet *buff, size_t n_bytes)
     if (n_bytes > 8)
         throw invalid_length("length field cannot be more than 64 bits");
     uint64_t tmp = 0;
-    avx_memcpy((void*)&tmp, buff, n_bytes);
+    short_memcpy((void*)&tmp, buff, n_bytes);
     return le64toh(tmp);
 }
 
diff --git a/Networking/sockets.cpp b/Networking/sockets.cpp
index ba434e1a..a1fe34c8 100644
--- a/Networking/sockets.cpp
+++ b/Networking/sockets.cpp
@@ -9,23 +9,12 @@ using namespace std;
 
 void error(const char *str)
 {
+  int old_errno = errno;
   char err[1000];
   gethostname(err,1000);
   strcat(err," : ");
   strcat(err,str);
-  perror(err);
-  throw bad_value();
-}
-
-void error(const char *str1,const char *str2)
-{
-  char err[1000];
-  gethostname(err,1000);
-  strcat(err," : ");
-  strcat(err,str1);
-  strcat(err,str2);
-  perror(err);
-  throw bad_value();
+  throw runtime_error(string() + err + " : " + strerror(old_errno));
 }
 
 void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
@@ -35,7 +24,7 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
    hints.ai_family = AF_INET;
    hints.ai_flags = AI_CANONNAME;
 
-   octet my_name[512];
+   char my_name[512];
    memset(my_name,0,512*sizeof(octet));
    gethostname((char*)my_name,512);
 
@@ -88,36 +77,39 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
    int attempts = 0;
    long wait = 1;
    int fl;
+   int connect_errno;
    do
-   {  fl=1;
-      while (fl==1 || errno==EINPROGRESS)
-        {
-          mysocket = socket(AF_INET, SOCK_STREAM, 0);
-          if (mysocket < 0)
-            error("set_up_socket:socket");
+   {
+       mysocket = socket(AF_INET, SOCK_STREAM, 0);
+       if (mysocket < 0)
+         error("set_up_socket:socket");
 
-          fl=connect(mysocket, addr, len);
-          attempts++;
-          if (fl != 0)
-            {
-              close(mysocket);
-              usleep(wait *= 2);
+       fl = connect(mysocket, addr, len);
+       connect_errno = errno;
+       attempts++;
+       if (fl != 0)
+         {
+           close(mysocket);
+           usleep(wait *= 2);
 #ifdef DEBUG_NETWORKING
-              string msg = "Connecting to " + string(hostname) + ":" +
-                  to_string(Portnum) + " failed";
-              perror(msg.c_str());
+           string msg = "Connecting to " + string(hostname) + ":" +
+               to_string(Portnum) + " failed";
+           errno = connect_errno;
+           perror(msg.c_str());
 #endif
-            }
-        }
+         }
+       errno = connect_errno;
    }
-   while (fl == -1 && (errno == ECONNREFUSED || errno == ETIMEDOUT)
-            && timer.elapsed() < 60);
+   while (fl == -1
+       && (errno == ECONNREFUSED || errno == ETIMEDOUT || errno == EINPROGRESS)
+       && timer.elapsed() < 60);
 
    if (fl < 0)
      {
-       cout << attempts << " attempts to " << hostname << ":" << Portnum
-           << endl;
-       error("set_up_socket:connect:", hostname);
+       throw runtime_error(
+           string() + "cannot connect from " + my_name + " to " + hostname + ":"
+               + to_string(Portnum) + " after " + to_string(attempts)
+               + " attempts in one minute because " + strerror(connect_errno));
      }
 
    freeaddrinfo(ai);
@@ -127,9 +119,6 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
   fl= setsockopt(mysocket, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int));
   if (fl<0) { error("set_up_socket:setsockopt");  }
 
-  fl=setsockopt(mysocket, SOL_SOCKET, SO_REUSEADDR, (char*)&one, sizeof(int));
-  if (fl<0) { error("set_up_socket:setsockopt"); }
-
 #ifdef __APPLE__
   int flags = fcntl(mysocket, F_GETFL, 0);
   fl = fcntl(mysocket, F_SETFL, O_NONBLOCK |  flags);
@@ -147,5 +136,3 @@ void close_client_socket(int socket)
       error(tmp);
     }
 }
-
-unsigned long long sent_amount = 0, sent_counter = 0;
diff --git a/Networking/sockets.h b/Networking/sockets.h
index e3b88d68..7f48aad1 100644
--- a/Networking/sockets.h
+++ b/Networking/sockets.h
@@ -24,7 +24,6 @@
 using namespace std;
 
 
-void error(const char *str1,const char *str2);
 void error(const char *str);
 
 void set_up_client_socket(int& mysocket,const char* hostname,int Portnum);
@@ -42,9 +41,6 @@ template<class T>
 void receive(T socket, octet* msg, size_t len);
 
 
-extern unsigned long long sent_amount, sent_counter;
-
-
 inline size_t send_non_blocking(int socket, octet* msg, size_t len)
 {
   int j = send(socket,msg,len,MSG_DONTWAIT);
@@ -66,9 +62,6 @@ inline void send(int socket,octet *msg,size_t len)
     {
       i += send_non_blocking(socket, msg + i, len - i);
     }
-
-  sent_amount += len;
-  sent_counter++;
 }
 
 template<class T>
diff --git a/OT/BaseOT.cpp b/OT/BaseOT.cpp
index 33826798..8847728e 100644
--- a/OT/BaseOT.cpp
+++ b/OT/BaseOT.cpp
@@ -7,10 +7,18 @@
 #include <fstream>
 #include <pthread.h>
 
+#ifndef NO_AVX_OT
 extern "C" {
 #include "SimpleOT/ot_sender.h"
 #include "SimpleOT/ot_receiver.h"
 }
+#endif
+
+#include "ECDSA/P256Element.h"
+
+#ifdef USE_RISTRETTO
+#include "ECDSA/CurveElement.h"
+#endif
 
 using namespace std;
 
@@ -70,7 +78,57 @@ void send_if_ot_receiver(TwoPartyPlayer* P, vector<octetStream>& os, OT_ROLE rol
 
 void BaseOT::exec_base(bool new_receiver_inputs)
 {
-    if (not cpu_has_avx())
+#ifdef NO_AVX_OT
+#ifdef USE_RISTRETTO
+    typedef CurveElement Element;
+#else
+    typedef P256Element Element;
+#endif
+
+    Element::init();
+
+    vector<Element::Scalar> as, bs;
+    vector<Element> As;
+    SeededPRNG G;
+    vector<octetStream> os(2);
+
+    if (ot_role & SENDER)
+        for (int i = 0; i < nOT; i++)
+        {
+            as.push_back(G.get<Element::Scalar>());
+            As.push_back(as.back());
+            As.back().pack(os[0]);
+        }
+
+    send_if_ot_sender(P, os, ot_role);
+    os[0].reset_write_head();
+
+    if (ot_role & RECEIVER)
+        for (int i = 0; i < nOT; i++)
+        {
+            if (new_receiver_inputs)
+                receiver_inputs[i] = G.get_bit();
+            auto b = G.get<Element::Scalar>();
+            Element B = b;
+            auto A = os[1].get<Element>();
+            if (receiver_inputs[i])
+                B += A;
+            B.pack(os[0]);
+            receiver_outputs[i] = (A * b).hash(AES_BLK_SIZE);
+        }
+
+    send_if_ot_receiver(P, os, ot_role);
+
+    if (ot_role & SENDER)
+        for (int i = 0; i < nOT; i++)
+        {
+            auto B = os[1].get<Element>();
+            sender_inputs.at(i).at(0) = (B * as[i]).hash(AES_BLK_SIZE);
+            sender_inputs.at(i).at(1) = ((B - As[i]) * as[i]).hash(AES_BLK_SIZE);
+        }
+
+#else
+    if (not cpu_has_avx(true))
         throw runtime_error("SimpleOT needs AVX support");
 
     int i, j, k;
@@ -179,6 +237,7 @@ void BaseOT::exec_base(bool new_receiver_inputs)
         printf("\n");
         #endif
     }
+#endif
 
     for (int i = 0; i < nOT; i++)
     {
diff --git a/OT/BitMatrix.h b/OT/BitMatrix.h
index 72f23b7a..311f069e 100644
--- a/OT/BitMatrix.h
+++ b/OT/BitMatrix.h
@@ -6,9 +6,9 @@
 #ifndef OT_BITMATRIX_H_
 #define OT_BITMATRIX_H_
 
+#include "Tools/intrinsics.h"
+
 #include <vector>
-#include <emmintrin.h>
-#include <immintrin.h>
 #include <iostream>
 
 using namespace std;
diff --git a/OT/OTExtension.cpp b/OT/OTExtension.cpp
index 277e6de2..8d6199f6 100644
--- a/OT/OTExtension.cpp
+++ b/OT/OTExtension.cpp
@@ -5,8 +5,7 @@
 #include "Math/gf2n.h"
 #include "Tools/aes.h"
 #include "Tools/MMO.h"
-#include <wmmintrin.h>
-#include <emmintrin.h>
+#include "Tools/intrinsics.h"
 
 
 OTExtension::OTExtension(const BaseOT& baseOT, TwoPartyPlayer* player,
diff --git a/OT/square128.cpp b/OT/square128.cpp
index 227e49ca..24386209 100644
--- a/OT/square128.cpp
+++ b/OT/square128.cpp
@@ -3,13 +3,12 @@
  *
  */
 
-#include <smmintrin.h>
-#include <immintrin.h>
 #include <mpirxx.h>
 
 #include "BitMatrix.h"
 #include "Tools/random.h"
 #include "Tools/BitVector.h"
+#include "Tools/intrinsics.h"
 #include "Math/Square.h"
 
 union matrix16x8
diff --git a/Processor/Instruction.h b/Processor/Instruction.h
index 1a0508bc..891edf6e 100644
--- a/Processor/Instruction.h
+++ b/Processor/Instruction.h
@@ -287,7 +287,6 @@ enum
 
 // Register types
 enum RegType {
-  MODP,
   INT,
   SBIT,
   CBIT,
diff --git a/Processor/Instruction.hpp b/Processor/Instruction.hpp
index 6ae2f6d0..dbeb6f95 100644
--- a/Processor/Instruction.hpp
+++ b/Processor/Instruction.hpp
@@ -342,7 +342,6 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos)
 
       // write to external client, input is : opcode num_args, client_id, message_type, var1, var2 ...
       case WRITESOCKETC:
-      case WRITESOCKETS:
       case WRITESOCKETSHARE:
       case WRITESOCKETINT:
         num_var_args = get_int(s) - 2;
@@ -350,6 +349,8 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos)
         r[1] = get_int(s);
         get_vector(num_var_args, start, s);
         break;
+      case WRITESOCKETS:
+        throw runtime_error("sending MACs to client not supported any more");
       case CONNECTIPV4:
         throw runtime_error("parties as clients not supported any more");
       case READCLIENTPUBLICKEY:
@@ -590,6 +591,7 @@ int BaseInstruction::get_reg_type() const
     case SHLCI:
     case SHRCI:
     case CONVINT:
+    case PUBINPUT:
       return CINT;
     default:
       if (is_gf2n_instruction())
@@ -1145,29 +1147,17 @@ inline void Instruction::execute(Processor<sint, sgf2n>& Proc) const
         // read shares and MAC shares
         Proc.read_socket_private(Proc.read_Ci(r[0]), start, true);
         break;
-      case GREADSOCKETS:
-        //Proc.get_S2_ref(r[0]).get_share().pack(socket_octetstream);
-        //Proc.get_S2_ref(r[0]).get_mac().pack(socket_octetstream);
-        break;
       case WRITESOCKETINT:
-        Proc.write_socket(INT, CLEAR, false, Proc.read_Ci(r[0]), r[1], start);
+        Proc.write_socket(INT, Proc.read_Ci(r[0]), r[1], start);
         break;
       case WRITESOCKETC:
-        Proc.write_socket(MODP, CLEAR, false, Proc.read_Ci(r[0]), r[1], start);
-        break;
-      case WRITESOCKETS:
-        // Send shares + MACs
-        Proc.write_socket(MODP, SECRET, true, Proc.read_Ci(r[0]), r[1], start);
+        Proc.write_socket(CINT, Proc.read_Ci(r[0]), r[1], start);
         break;
       case WRITESOCKETSHARE:
         // Send only shares, no MACs
         // N.B. doesn't make sense to have a corresponding read instruction for this
-        Proc.write_socket(MODP, SECRET, false, Proc.read_Ci(r[0]), r[1], start);
+        Proc.write_socket(SINT, Proc.read_Ci(r[0]), r[1], start);
         break;
-      /*case GWRITESOCKETS:
-        Proc.get_S2_ref(r[0]).get_share().pack(socket_octetstream);
-        Proc.get_S2_ref(r[0]).get_mac().pack(socket_octetstream);
-        break;*/
       case WRITEFILESHARE:
         // Write shares to file system
         Proc.write_shares_to_file(start);
diff --git a/Processor/Machine.h b/Processor/Machine.h
index 2c268a46..4a68f798 100644
--- a/Processor/Machine.h
+++ b/Processor/Machine.h
@@ -45,6 +45,8 @@ class Machine : public BaseMachine
   // Keep record of used offline data
   DataPositions pos;
 
+  Player* P;
+
   void load_program(const string& threadname, const string& filename);
 
   public:
@@ -75,6 +77,7 @@ class Machine : public BaseMachine
       const string& memtype, int lg2, bool direct, int opening_sum,
       bool receive_threads, int max_broadcast, bool use_encryption, bool live_prep,
       OnlineOptions opts);
+  ~Machine();
 
   const Names& get_N() { return N; }
 
diff --git a/Processor/Machine.hpp b/Processor/Machine.hpp
index c566233e..17da8a02 100644
--- a/Processor/Machine.hpp
+++ b/Processor/Machine.hpp
@@ -49,7 +49,6 @@ Machine<sint, sgf2n>::Machine(int my_number, Names& playerNames,
   // make directory for outputs if necessary
   mkdir_p(PREP_DIR);
 
-  Player* P;
   if (use_encryption)
     P = new CryptoPlayer(N, 0xF00);
   else
@@ -103,8 +102,6 @@ Machine<sint, sgf2n>::Machine(int my_number, Names& playerNames,
       ot_setups.push_back({ *P, true });
   }
 
-  delete P;
-
   /* Set up the threads */
   tinfo.resize(nthreads);
   threads.resize(nthreads);
@@ -131,6 +128,12 @@ Machine<sint, sgf2n>::Machine(int my_number, Names& playerNames,
     }
 }
 
+template<class sint, class sgf2n>
+Machine<sint, sgf2n>::~Machine()
+{
+  delete P;
+}
+
 template<class sint, class sgf2n>
 void Machine<sint, sgf2n>::load_program(const string& threadname,
     const string& filename)
@@ -318,7 +321,7 @@ void Machine<sint, sgf2n>::run()
   print_timers();
   cerr << "Data sent = " << data_sent / 1e6 << " MB" << endl;
 
-  PlainPlayer P(N, 0xFF00);
+  auto& P = *this->P;
   Bundle<octetStream> bundle(P);
   bundle.mine.store(data_sent.load());
   P.Broadcast_Receive_no_stats(bundle);
diff --git a/Processor/OfflineMachine.hpp b/Processor/OfflineMachine.hpp
index b065b75d..0e33de64 100644
--- a/Processor/OfflineMachine.hpp
+++ b/Processor/OfflineMachine.hpp
@@ -34,6 +34,8 @@ template<class W>
 template<class T, class U>
 int OfflineMachine<W>::run()
 {
+    T::clear::init_default(this->online_opts.prime_length());
+    U::clear::init_field(U::clear::default_degree());
     T::bit_type::mac_key_type::init_field();
     auto binary_mac_key = read_generate_write_mac_key<typename T::bit_type>(P);
     GC::ShareThread<typename T::bit_type> thread(playerNames,
@@ -52,7 +54,6 @@ template<class W>
 template<class T>
 void OfflineMachine<W>::generate()
 {
-    T::clear::init_default(this->online_opts.prime_length());
     T::clear::next::template init<typename T::clear>(false);
     T::clear::template write_setup<T>(P.num_players());
     auto mac_key = read_generate_write_mac_key<T>(P);
diff --git a/Processor/OnlineMachine.hpp b/Processor/OnlineMachine.hpp
index 28f0d8aa..6b72b34f 100644
--- a/Processor/OnlineMachine.hpp
+++ b/Processor/OnlineMachine.hpp
@@ -200,12 +200,10 @@ void OnlineMachine::start_networking()
     } else {
       if (not opt.get("-ext-server")->isSet)
       {
-        if (my_port != Names::DEFAULT_PORT)
-          throw runtime_error("cannot set port number when not using Server.x");
         if (nplayers == 0)
           opt.get("-N")->getInt(nplayers);
         server = Server::start_networking(playerNames, mynum, nplayers,
-            hostname, pnbase);
+            hostname, pnbase, my_port);
       }
       else
       {
diff --git a/Processor/Processor.h b/Processor/Processor.h
index 137f54e2..dc152486 100644
--- a/Processor/Processor.h
+++ b/Processor/Processor.h
@@ -224,7 +224,7 @@ class Processor : public ArithmeticProcessor
   // Access to external client sockets for reading clear/shared data
   void read_socket_ints(int client_id, const vector<int>& registers);
   
-  void write_socket(const RegType reg_type, const SecrecyType secrecy_type, const bool send_macs,
+  void write_socket(const RegType reg_type,
                              int socket_id, int message_type, const vector<int>& registers);
 
   void read_socket_vector(int client_id, const vector<int>& registers);
diff --git a/Processor/Processor.hpp b/Processor/Processor.hpp
index a139338e..bb751975 100644
--- a/Processor/Processor.hpp
+++ b/Processor/Processor.hpp
@@ -241,7 +241,7 @@ void Processor<sint, sgf2n>::split(const Instruction& instruction)
 // If message_type is > 0, send message_type in bytes 0 - 3, to allow an external client to
 //  determine the data structure being sent in a message.
 template<class sint, class sgf2n>
-void Processor<sint, sgf2n>::write_socket(const RegType reg_type, const SecrecyType secrecy_type, const bool send_macs,
+void Processor<sint, sgf2n>::write_socket(const RegType reg_type,
                              int socket_id, int message_type, const vector<int>& registers)
 {
   int m = registers.size();
@@ -254,26 +254,23 @@ void Processor<sint, sgf2n>::write_socket(const RegType reg_type, const SecrecyT
 
   for (int i = 0; i < m; i++)
   {
-    if (reg_type == MODP && secrecy_type == SECRET) {
-      // Send vector of secret shares and optionally macs
-      if (send_macs)
-        get_Sp_ref(registers[i]).pack(socket_stream);
-      else
-        get_Sp_ref(registers[i]).pack(socket_stream,
-            sint::get_rec_factor(P.my_num(), P.num_players()));
+    if (reg_type == SINT) {
+      // Send vector of secret shares
+      get_Sp_ref(registers[i]).pack(socket_stream,
+          sint::get_rec_factor(P.my_num(), P.num_players()));
     }
-    else if (reg_type == MODP && secrecy_type == CLEAR) {
+    else if (reg_type == CINT) {
       // Send vector of clear public field elements
       get_Cp_ref(registers[i]).pack(socket_stream);
     }
-    else if (reg_type == INT && secrecy_type == CLEAR) {
+    else if (reg_type == INT) {
       // Send vector of 32-bit clear ints
       socket_stream.store((int&)get_Ci_ref(registers[i]));
     } 
     else {
       stringstream ss;
       ss << "Write socket instruction with unknown reg type " << reg_type << 
-        " and secrecy type " << secrecy_type << "." << endl;      
+        "." << endl;
       throw Processor_Error(ss.str());
     }
   }
diff --git a/Programs/Source/benchmark_mobilenet.mpc b/Programs/Source/benchmark_mobilenet.mpc
index 9ed1c39d..63e20f7c 100644
--- a/Programs/Source/benchmark_mobilenet.mpc
+++ b/Programs/Source/benchmark_mobilenet.mpc
@@ -1,4 +1,11 @@
-import ml
+import ml, sys
+
+if len(program.args) < 2:
+   print("You need to identify a concrete network such as 'v1_0.25_128'.",
+         file=sys.stderr)
+   print("Refer to https://github.com/anderspkd/SecureQ8 for scripts to run "
+         "this benchmark.", file=sys.stderr)
+   exit(1)
 
 network = program.args[1]
 
diff --git a/Programs/Source/gc_oram.mpc b/Programs/Source/gc_oram.mpc
index 558e6bc5..058a37aa 100644
--- a/Programs/Source/gc_oram.mpc
+++ b/Programs/Source/gc_oram.mpc
@@ -3,7 +3,7 @@ prog = program
 from Compiler.GC.types import *
 from Compiler.GC.instructions import *
 
-bits.unit = 128
+bits.unit = 64
 
 program.to_merge = [ldmsdi, stmsdi, ldmsd, stmsd, stmsdci, xors, andrs]
 program.stop_class = type(None)
@@ -11,7 +11,7 @@ program.stop_class = type(None)
 from Compiler.circuit_oram import *
 from Compiler import circuit_oram
 
-from Compiler import oram
+import oram
 oram.n_threads = 1
 oram.n_threads_for_tree = 1
 
diff --git a/Protocols/FakeShare.h b/Protocols/FakeShare.h
index 36b0b4d1..8b083a7f 100644
--- a/Protocols/FakeShare.h
+++ b/Protocols/FakeShare.h
@@ -58,16 +58,6 @@ public:
     {
     }
 
-    void add(T a, T b, int = 0, T = {})
-    {
-        *this = a + b;
-    }
-
-    void sub(T a, T b, int = 0, T = {})
-    {
-        *this = a - b;
-    }
-
     static void split(vector<bit_type>& dest, const vector<int>& regs,
             int n_bits, const This* source, int n_inputs,
             GC::FakeSecret::Protocol& protocol);
diff --git a/Protocols/MAC_Check.h b/Protocols/MAC_Check.h
index 104a0b50..d17eeef9 100644
--- a/Protocols/MAC_Check.h
+++ b/Protocols/MAC_Check.h
@@ -161,15 +161,6 @@ public:
   void exchange(const Player& P);
 };
 
-template <class T>
-class Passing_MAC_Check : public Direct_MAC_Check<T>
-{
-public:
-  Passing_MAC_Check(const typename T::mac_key_type::Scalar& ai);
-
-  void exchange(const Player& P);
-};
-
 
 enum mc_timer { SEND, RECV_ADD, BCAST, RECV_SUM, SEED, COMMIT, WAIT_SUMMER, RECV, SUM, SELECT, MAX_TIMER };
 
diff --git a/Protocols/MAC_Check.hpp b/Protocols/MAC_Check.hpp
index 5eee59bf..10cc3b45 100644
--- a/Protocols/MAC_Check.hpp
+++ b/Protocols/MAC_Check.hpp
@@ -390,25 +390,6 @@ void Direct_MAC_Check<T>::exchange(const Player& P)
   this->CheckIfNeeded(P);
 }
 
-template<class T>
-Passing_MAC_Check<T>::Passing_MAC_Check(const typename T::mac_key_type::Scalar& ai) :
-    Direct_MAC_Check<T>(ai)
-{
-}
-
-template<class T>
-void passing_add_openings(vector<T>& values, octetStream& os)
-{
-  octetStream new_os;
-  for (unsigned int i=0; i<values.size(); i++)
-    {
-      T tmp;
-      tmp.unpack(os);
-      (tmp + values[i]).pack(new_os);
-    }
-  os = new_os;
-}
-
 template<class T>
 void Direct_MAC_Check<T>::init_open(const Player& P, int n)
 {
@@ -422,20 +403,4 @@ void Direct_MAC_Check<T>::prepare_open(const T& secret)
   this->macs.push_back(secret.get_mac());
 }
 
-template<class T>
-void Passing_MAC_Check<T>::exchange(const Player& P)
-{
-  this->pre_exchange(P);
-  for (int i = 0; i < P.num_players() - 1; i++)
-    {
-      P.pass_around(this->os);
-      passing_add_openings(this->values, this->os);
-    }
-  for (auto& x : this->values)
-    x.unpack(this->os);
-  this->AddToValues(this->values);
-  this->popen_cnt += this->values.size();
-  this->CheckIfNeeded(P);
-}
-
 #endif
diff --git a/Protocols/MalRepRingPrep.hpp b/Protocols/MalRepRingPrep.hpp
index 2dc93cbc..ce34b64e 100644
--- a/Protocols/MalRepRingPrep.hpp
+++ b/Protocols/MalRepRingPrep.hpp
@@ -125,7 +125,6 @@ template<class U>
 void ShuffleSacrifice::shuffle(vector<U>& check_triples, Player& P)
 {
     int buffer_size = check_triples.size();
-    assert(buffer_size >= minimum_n_inputs());
 
     // shuffle
     GlobalPRNG G(P);
@@ -137,13 +136,24 @@ void ShuffleSacrifice::shuffle(vector<U>& check_triples, Player& P)
     }
 }
 
+template<class T>
+TripleShuffleSacrifice<T>::TripleShuffleSacrifice()
+{
+}
+
+template<class T>
+TripleShuffleSacrifice<T>::TripleShuffleSacrifice(int B, int C) :
+        ShuffleSacrifice(B, C)
+{
+}
+
 template<class T>
 void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
         vector<array<T, 3>>& check_triples, Player& P,
         typename T::MAC_Check& MC, ThreadQueues* queues)
 {
     int buffer_size = check_triples.size();
-    int N = (buffer_size - C) / B;
+    size_t N = (buffer_size - C) / B;
 
     shuffle(check_triples, P);
 
@@ -161,7 +171,9 @@ void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
         if (typename T::clear(opened[3 * i] * opened[3 * i + 1]) != opened[3 * i + 2])
             throw Offline_Check_Error("shuffle opening");
 
-    triples.resize(N);
+    // triples might be same as check_triples
+    if (triples.size() < N)
+        triples.resize(N);
 
     if (queues)
     {
@@ -172,6 +184,8 @@ void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
     }
     else
         triple_sacrifice(triples, check_triples, P, MC, 0, N);
+
+    triples.resize(N);
 }
 
 template<class T>
@@ -188,6 +202,7 @@ void TripleShuffleSacrifice<T>::triple_sacrifice(vector<array<T, 3>>& triples,
     int N = buffer_size / B;
     int size = end - begin;
     masked.reserve(2 * size);
+    assert(size_t(end * B) <= check_triples.size());
     for (int i = begin; i < end; i++)
     {
         T& a = check_triples[i][0];
diff --git a/Protocols/ReplicatedPrep.hpp b/Protocols/ReplicatedPrep.hpp
index f08e10ea..efd55be1 100644
--- a/Protocols/ReplicatedPrep.hpp
+++ b/Protocols/ReplicatedPrep.hpp
@@ -326,7 +326,8 @@ void buffer_bits_spec(ReplicatedPrep<T<gfp_<X, L>>>& prep, vector<T<gfp_<X, L>>>
     typename T<gfp_<X, L>>::Protocol& prot)
 {
     (void) bits, (void) prot;
-    if (prot.get_n_relevant_players() > 10)
+    if (prot.get_n_relevant_players() > 10
+            or OnlineOptions::singleton.bits_from_squares)
         buffer_bits_from_squares(prep);
     else
         prep.ReplicatedRingPrep<T<gfp_<X, L>>>::buffer_bits();
diff --git a/Protocols/ShamirShare.h b/Protocols/ShamirShare.h
index c84d4175..921e4888 100644
--- a/Protocols/ShamirShare.h
+++ b/Protocols/ShamirShare.h
@@ -101,15 +101,6 @@ public:
         T::assign(buffer);
     }
 
-    void add(const ShamirShare& x, const ShamirShare& y)
-    {
-        *this = x + y;
-    }
-    void sub(const ShamirShare& x, const ShamirShare& y)
-    {
-        *this = x - y;
-    }
-
     void add(const ShamirShare& S, const clear aa, int my_num,
             const T& alphai)
     {
diff --git a/Protocols/Share.h b/Protocols/Share.h
index d47da0d6..97fddcc9 100644
--- a/Protocols/Share.h
+++ b/Protocols/Share.h
@@ -18,7 +18,6 @@ template<class T> class Share;
 
 template<class T> class MAC_Check_;
 template<class T> class Direct_MAC_Check;
-template<class T> class Passing_MAC_Check;
 template<class T> class MascotMultiplier;
 template<class T> class MascotFieldPrep;
 template<class T> class MascotTripleGenerator;
diff --git a/Protocols/ShuffleSacrifice.h b/Protocols/ShuffleSacrifice.h
index 304091f3..e133d790 100644
--- a/Protocols/ShuffleSacrifice.h
+++ b/Protocols/ShuffleSacrifice.h
@@ -27,6 +27,7 @@ public:
     const int C;
 
     ShuffleSacrifice();
+    ShuffleSacrifice(int B, int C);
 
     int minimum_n_inputs(int n_outputs = 1)
     {
@@ -56,6 +57,9 @@ template<class T>
 class TripleShuffleSacrifice : public ShuffleSacrifice
 {
 public:
+    TripleShuffleSacrifice();
+    TripleShuffleSacrifice(int B, int C);
+
     void triple_sacrifice(vector<array<T, 3>>& triples,
             vector<array<T, 3>>& check_triples, Player& P,
             typename T::MAC_Check& MC, ThreadQueues* queues = 0);
diff --git a/Protocols/ShuffleSacrifice.hpp b/Protocols/ShuffleSacrifice.hpp
index 0808b752..ebd42afd 100644
--- a/Protocols/ShuffleSacrifice.hpp
+++ b/Protocols/ShuffleSacrifice.hpp
@@ -19,6 +19,12 @@ ShuffleSacrifice::ShuffleSacrifice() :
 {
 }
 
+inline
+ShuffleSacrifice::ShuffleSacrifice(int B, int C) :
+        B(B), C(C)
+{
+}
+
 template<class T>
 void TripleShuffleSacrifice<T>::triple_combine(vector<array<T, 3> >& triples,
         vector<array<T, 3> >& to_combine, Player& P,
diff --git a/README.md b/README.md
index ec057af1..d123a053 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ parties and malicious security.
 On Linux, this requires a working toolchain and [all
 requirements](#requirements). On Ubuntu, the following might suffice:
 ```
-apt-get install automake build-essential git libboost-dev libboost-thread-dev libsodium-dev libssl-dev libtool m4 python texinfo yasm
+apt-get install automake build-essential git libboost-dev libboost-thread-dev libntl-dev libsodium-dev libssl-dev libtool m4 python3 texinfo yasm
 ```
 On MacOS, this requires [brew](https://brew.sh) to be installed,
 which will be used for all dependencies.
@@ -77,13 +77,74 @@ The following table lists all protocols that are fully supported.
 | Malicious, dishonest majority | [MASCOT / LowGear / HighGear](#secret-sharing) | [SPDZ2k](#secret-sharing) | [Tiny / Tinier](#secret-sharing) | [BMR](#bmr) |
 | Covert, dishonest majority | [CowGear / ChaiGear](#secret-sharing) | N/A | N/A | N/A |
 | Semi-honest, dishonest majority | [Semi / Hemi / Soho](#secret-sharing) | [Semi2k](#secret-sharing) | [SemiBin](#secret-sharing) | [Yao's GC](#yaos-garbled-circuits) / [BMR](#bmr) |
-| Malicious, honest majority | [Shamir / Rep3 / PS / SY](#honest-majority) | [Brain / Rep[34] / PS / SY](#honest-majority) | [Rep3 / CCD](#honest-majority) | [BMR](#bmr) |
+| Malicious, honest majority | [Shamir / Rep3 / PS / SY](#honest-majority) | [Brain / Rep[34] / PS / SY](#honest-majority) | [Rep3 / CCD / PS](#honest-majority) | [BMR](#bmr) |
 | Semi-honest, honest majority | [Shamir / Rep3](#honest-majority) | [Rep3](#honest-majority) | [Rep3 / CCD](#honest-majority) | [BMR](#bmr) |
 
 See [this paper](https://eprint.iacr.org/2020/300) for an explanation
 of the various security models and high-level introduction to
 multi-party computation.
 
+##### Finding the most efficient protocol
+
+Lower security requirements generally allow for more efficient
+protocols. Within the same security model (line in the table above),
+there are a few things to consider:
+
+- Computation domain: Arithmetic protocols (modulo prime or power of
+  two) are preferable for many applications because they offer integer
+  addition and multiplication at low cost. However, binary circuits
+  might be a better option if there is very little integer
+  computation. [See below](#finding-the-most-efficient-variant) to
+  find the most efficient mixed-circuit variant.  Furthermore, local
+  computation modulo a power of two is cheaper, but MP-SPDZ does not
+  offer this domain with homomorphic encryption.
+
+- Secret sharing vs garbled circuits: Computation using secret sharing
+  requires a number of communication rounds that grows depending on
+  the computation, which is not the case for garbled
+  circuits. However, the cost of integer computation as a binary
+  circuit often offsets this. MP-SPDZ only offers garbled circuits
+  with binary computation.
+
+- Underlying technology for dishonest majority: While secret sharing
+  alone suffices for honest-majority computation, dishonest majority
+  requires either homomorphic encryption (HE) or oblivious transfer
+  (OT). The two options offer a computation-communication trade-off:
+  While OT is easier to compute, HE requires less
+  communication. Furthermore, the latter requires a certain amount of
+  batching to be efficient, which makes OT preferable for smaller
+  tasks.
+
+- Malicious, honest-majority three-party computation: A number of
+  protocols are available for this setting, but SY/SPDZ-wise is the
+  most efficient one for a number of reasons: It requires the lowest
+  communication, and it is the only one offering constant-communication
+  dot products.
+
+- Minor variants: Some command-line options change aspects of the
+  protocols such as:
+
+  - `--bucket-size`: In some malicious binary computation and
+    malicious edaBit generation, a smaller bucket size allows
+    preprocessing in smaller batches at a higher asymptotic cost.
+
+  - `--batch-size`: Preprocessing in smaller batches avoids generating
+    too much but larger batches save communication rounds.
+
+  - `--direct`: In dishonest-majority protocols, direct communication
+    instead of star-shaped saves communication rounds at the expense
+    of a quadratic amount. This might be beneficial with a small
+    number of parties.
+
+  - `--bits-from-squares`: In some protocols computing modulo a prime
+    (Shamir, Rep3, SPDZ-wise), this switches from generating random
+    bits via XOR of parties' inputs to generation using the root of a
+    random square.
+
+  - `--top-gear`: In protocols with malicious security using
+    homomorphic encryption, this reduces the memory usage and batch
+    size for preprocessing.
+
 #### Paper and Citation
 
 The design of MP-SPDZ is described in [this
@@ -151,13 +212,16 @@ phase outputs the amount of offline material required, which allows to
 compute the preprocessing time for a particular computation.
 
 #### Requirements
- - GCC 5 or later (tested with up to 10) or LLVM/clang 5 or later (tested with up to 11). We recommend clang because it performs better.
+
+ - GCC 5 or later (tested with up to 10) or LLVM/clang 5 or later
+   (only x86; tested with up to 11). For x86, we recommend clang
+   because it performs better.
  - MPIR library, compiled with C++ support (use flag `--enable-cxx` when running configure). You can use `make -j8 tldr` to install it locally.
  - libsodium library, tested against 1.0.16
  - OpenSSL, tested against 1.1.1
  - Boost.Asio with SSL support (`libboost-dev` on Ubuntu), tested against 1.65
  - Boost.Thread for BMR (`libboost-thread-dev` on Ubuntu), tested against 1.65
- - 64-bit CPU
+ - x86 or ARM 64-bit CPU (the latter tested with AWS Graviton)
  - Python 3.5 or later
  - NTL library for homomorphic encryption (optional; tested with NTL 10.5)
  - If using macOS, Sierra or later
@@ -168,13 +232,14 @@ compute the preprocessing time for a particular computation.
 
  - By default, the binaries are optimized for the CPU you are
    compiling on.
-   For all optimizations, a CPU supporting AES-NI, PCLMUL, AVX2, BMI2, ADX is
+   For all optimizations on x86, a CPU supporting AES-NI, PCLMUL, AVX2, BMI2, ADX is
    required. This includes mainstream processors released 2014 or later.
    If you intend to run on a different CPU than compiling, you might
    need to change the `ARCH` variable in `CONFIG` or `CONFIG.mine` to
    `-march=<cpu>`. See the [GCC
    documentation](https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html)
-   for the possible options.
+   for the possible options. To run OT-based protocols on x86 without AVX,
+   add `AVX_OT = 0` in addition.
  - To benchmark online-only protocols or Overdrive offline phases, add the following line at the top: `MY_CFLAGS = -DINSECURE`
  - `PREP_DIR` should point to a local, unversioned directory to store preprocessing data (the default is `Player-Data` in the current directory).
  - For homomorphic encryption, set `USE_NTL = 1`.
@@ -299,6 +364,19 @@ compiler where `n` is the number of parties for the standard variant
 and 2 for the special
 variant by Mohassel and Rindal (available in Rep3 only).
 
+##### Finding the most efficient variant
+
+Where available, local share conversion is likely the most efficient
+variant. Protocols based on Shamir secret sharing are unlikely to
+benefit from mixed-circuit computation because they use an extension
+field for binary computation. Otherwise, edaBits likely offer an
+asymptotic benefit. However, malicious protocols by default generate
+large batches of edaBits (more than one million at once), which is
+only worthwhile for accordingly large computation. For smaller
+computation, try running the virtual machines with `-B 4` or `-B 5`,
+which reduces the batch size to ~10,000 and ~1,000, respectively, at a
+higher asymptotic cost.
+
 #### Bristol Fashion circuits
 
 Bristol Fashion is the name of a description format of binary circuits
@@ -386,7 +464,8 @@ This runs the compiled bytecode in cleartext computation.
 
 Some full implementations require oblivious transfer, which is
 implemented as OT extension based on
-https://github.com/mkskeller/SimpleOT.
+https://github.com/mkskeller/SimpleOT or OpenSSL (activate the
+latter with `AVX_OT = 0` in `CONFIG` or `CONFIG.mine`).
 
 ### Secret sharing
 
@@ -524,6 +603,7 @@ The following table shows all programs for honest-majority computation:
 | `rep4-ring-party.x` | Replicated | Mod 2^k | Y | 4 | `rep4-ring.sh` |
 | `replicated-bin-party.x` | Replicated | Binary | N | 3 | `replicated.sh` |
 | `malicious-rep-bin-party.x` | Replicated | Binary | Y | 3 | `mal-rep-bin.sh` |
+| `ps-rep-bin-party.x` | Replicated | Binary | Y | 3 | `ps-rep-bin.sh` |
 | `replicated-field-party.x` | Replicated | Mod prime | N | 3 | `rep-field.sh` |
 | `ps-rep-field-party.x` | Replicated | Mod prime | Y | 3 | `ps-rep-field.sh` |
 | `sy-rep-field-party.x` | SPDZ-wise replicated | Mod prime | Y | 3 | `sy-rep-field.sh` |
@@ -537,7 +617,7 @@ The following table shows all programs for honest-majority computation:
 We use the "generate random triple optimistically/sacrifice/Beaver"
 methodology described by [Lindell and
 Nof](https://eprint.iacr.org/2017/816) to achieve malicious
-security with plain replicated secret sharing,
+security with plain arithmetic replicated secret sharing,
 except for the "PS" (post-sacrifice) protocols where the
 actual multiplication is executed optimistically and checked later as
 also described by Lindell and Nof.
@@ -563,6 +643,13 @@ secret value and information-theoretic tag similar to SPDZ but not
 with additive secret sharing, hence the name.
 Rep4 refers to the four-party protocol by [Dalskov et
 al.](https://eprint.iacr.org/2020/1330).
+`malicious-rep-bin-party.x` is based on cut-and-choose triple
+generation by [Furukawa et al.](https://eprint.iacr.org/2016/944) but
+using Beaver multiplication instead of their post-sacrifice
+approach. `ps-rep-bin-party.x` is based on the post-sacrifice approach
+by [Araki et
+al.](https://www.ieee-security.org/TC/SP2017/papers/96.pdf) but
+without using their cache optimization.
 
 All protocols in this section require encrypted channels because the
 information received by the honest majority suffices the reconstruct
diff --git a/Scripts/bmr-program-run.sh b/Scripts/bmr-program-run.sh
index 0e803acf..9d7b9e33 100755
--- a/Scripts/bmr-program-run.sh
+++ b/Scripts/bmr-program-run.sh
@@ -35,5 +35,7 @@ done
 $prefix ./bmr-program-tparty.x $prog $netmap 2>&1 &> bmr-log/t &
 for i in $(seq $[n_players-1]); do
     $prefix ./bmr-program-party.x $i $prog $netmap $threshold 2>&1 &> bmr-log/$i &
+    id=$!
 done
 $prefix ./bmr-program-party.x $n_players $prog $netmap $threshold 2>&1 | tee bmr-log/$n_players
+wait $id
diff --git a/Scripts/ccd.sh b/Scripts/ccd.sh
index e911fb72..1b5c9545 100755
--- a/Scripts/ccd.sh
+++ b/Scripts/ccd.sh
@@ -3,7 +3,7 @@
 HERE=$(cd `dirname $0`; pwd)
 SPDZROOT=$HERE/..
 
-export PLAYERS=3
+export PLAYERS=${PLAYERS:-3}
 
 . $HERE/run-common.sh
 
diff --git a/Scripts/mal-ccd.sh b/Scripts/mal-ccd.sh
index f01d6d7d..b6fee5e3 100755
--- a/Scripts/mal-ccd.sh
+++ b/Scripts/mal-ccd.sh
@@ -3,7 +3,7 @@
 HERE=$(cd `dirname $0`; pwd)
 SPDZROOT=$HERE/..
 
-export PLAYERS=3
+export PLAYERS=${PLAYERS:-3}
 
 . $HERE/run-common.sh
 
diff --git a/Scripts/ps-rep-bin.sh b/Scripts/ps-rep-bin.sh
new file mode 100755
index 00000000..fc411b28
--- /dev/null
+++ b/Scripts/ps-rep-bin.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+HERE=$(cd `dirname $0`; pwd)
+SPDZROOT=$HERE/..
+
+export PLAYERS=3
+
+. $HERE/run-common.sh
+
+run_player ps-rep-bin-party.x $* || exit 1
diff --git a/Scripts/test_tutorial.sh b/Scripts/test_tutorial.sh
index 8ac1ef4d..7eb9acc0 100755
--- a/Scripts/test_tutorial.sh
+++ b/Scripts/test_tutorial.sh
@@ -7,6 +7,8 @@ while getopts XYC opt; do
 	   ;;
 	Y) dabit=2
 	   ;;
+	C) cont=1
+	   ;;
     esac
 done
 
@@ -31,7 +33,7 @@ function test_vm
 	    echo == Party $i
 	    cat logs/tutorial-$i
 	done
-	exit 1
+	test -z $cont && exit 1
     fi
 }
 
@@ -88,7 +90,7 @@ fi
 
 ./compile.py -B 16  $compile_opts tutorial
 
-for i in replicated mal-rep-bin semi-bin ccd mal-ccd; do
+for i in replicated mal-rep-bin ps-rep-bin semi-bin ccd mal-ccd; do
     test_vm $i $run_opts
 done
 
diff --git a/Tools/BitVector.cpp b/Tools/BitVector.cpp
index 1e8a987e..4ef3406f 100644
--- a/Tools/BitVector.cpp
+++ b/Tools/BitVector.cpp
@@ -41,7 +41,7 @@ BitVector BitVector::operator &(const BitVector& other) const
 
 bool BitVector::parity() const
 {
-#if defined(__SSE4_2__) or not defined(__clang__)
+#if (defined(__SSE4_2__) or not defined(__clang__)) and defined(__x86_64__)
     bool res = 0;
     for (size_t i = 0; i < size_bytes() / 8; i++)
         res ^= _popcnt64(((word*)bytes)[i]) & 1;
@@ -49,7 +49,17 @@ bool BitVector::parity() const
         res ^= _popcnt32(bytes[i]) & 1;
     return res;
 #else
-    throw runtime_error("need to compile with SSE4.2 support or GCC");
+    bool res = 0; // portable fallback for targets without the x86 popcnt intrinsics
+    for (size_t i = 0; i < size_bytes() / 8; i++)
+    {
+        word x = ((word*)bytes)[i];
+        for (int shift = 32; shift > 0; shift >>= 1) // XOR-fold the word into bit 0
+            x ^= (x >> shift);
+        res ^= (x & 1);
+    }
+    for (size_t i = size_bytes() / 8 * 8; i < size_bytes(); i++)
+        res ^= __builtin_parity(bytes[i]); // i is a byte index: fold the whole trailing byte
+    return res;
 #endif
 }
 
@@ -131,12 +141,19 @@ void BitVector::input(istream& s,bool human)
 
 void BitVector::pack(octetStream& o) const
 {
-    o.store(nbytes);
+    o.store_int(nbits, 8);
     o.append((octet*)bytes, nbytes);
 }
 
 void BitVector::unpack(octetStream& o)
 {
-    o.get(nbytes);
+    resize(o.get_int(8));
     o.consume((octet*)bytes, nbytes);
 }
+
+BitVector& BitVector::operator =(const octetStream other)
+{
+    resize(other.get_length() * 8);
+    memcpy(bytes, other.get_data(), nbytes);
+    return *this;
+}
diff --git a/Tools/BitVector.h b/Tools/BitVector.h
index 98c2cccf..05561051 100644
--- a/Tools/BitVector.h
+++ b/Tools/BitVector.h
@@ -7,7 +7,6 @@
 #include <vector>
 using namespace std;
 #include <stdlib.h>
-#include <pmmintrin.h>
 #include <assert.h>
 
 #include "Tools/Exceptions.h"
@@ -15,6 +14,7 @@ using namespace std;
 // just for util functions
 #include "Math/gf2nlong.h"
 #include "Math/FixedVec.h"
+#include "Tools/intrinsics.h"
 
 class PRNG;
 class octetStream;
@@ -137,6 +137,8 @@ class BitVector
         return *this;
     }
 
+    BitVector& operator=(const octetStream other);
+
     void swap(BitVector& other)
     {
         std::swap(nbits, other.nbits);
@@ -156,7 +158,7 @@ class BitVector
         void operator=(const Access& other) { *this = other.get(); }
         void operator^=(const Access& other) { *this = get() ^ other.get(); }
         bool operator==(const Access& other) const { return get() == other.get(); }
-        bool operator==(bool b) const { return get() == b; }
+        operator bool() const { return get(); }
     };
 
     bool operator[](int i) const { return get_bit(i); }
@@ -242,6 +244,11 @@ class BitVector
         return true;
     }
 
+    bool operator==(const BitVector& other)
+    {
+        return equals(other);
+    }
+
     void append(const BitVector& other, size_t length);
 
     void randomize(PRNG& G);
diff --git a/Tools/aes-arm.h b/Tools/aes-arm.h
new file mode 100644
index 00000000..0eacbe00
--- /dev/null
+++ b/Tools/aes-arm.h
@@ -0,0 +1,328 @@
+// This file is reduced to functionality necessary for AES in order to avoid
+// conflicts with simde.
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#else
+#error "Macro name collisions may happen with unsupported compiler."
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#endif
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is considered bad coding practice by Microsoft: @see:
+// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&   \
+    ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
+     (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) ||  \
+     (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_DATA(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+// In the absence of crypto extensions, implement aesenc using regular neon
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information. Reproduced with permission of the author.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+                                         0xc, 0x1, 0x6, 0xb};
+    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
+
+    // mix columns
+    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    //  add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
+    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
+     (b0))
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
+        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
+    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
+    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
+    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform the last round of an AES encryption flow (ShiftRows, SubBytes and
+// AddRoundKey, no MixColumns) on the state in a with RoundKey; returns it.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    /* FIXME: optimize for NEON. Plain nested braces keep this ISO C++. */
+    uint8_t v[4][4] = {
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],  // output bytes 0-3
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],  // output bytes 4-7
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],  // output bytes 8-11
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
+        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],  // output bytes 12-15
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
+         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+    };
+    for (int i = 0; i < 16; i++)
+        vreinterpretq_nth_u8_m128i(a, i) =
+            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
+    return a;
+}
+
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+//
+// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+{
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+}
+#undef SSE2NEON_AES_DATA
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
diff --git a/Tools/aes-ni.cpp b/Tools/aes-ni.cpp
index 49d4c13a..33de7cef 100644
--- a/Tools/aes-ni.cpp
+++ b/Tools/aes-ni.cpp
@@ -21,7 +21,7 @@ inline __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2)
 
 void aes_128_schedule( octet* key, const octet* userkey )
 {
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
   if (cpu_has_aes())
   {
     __m128i temp1, temp2;
@@ -64,7 +64,7 @@ void aes_128_schedule( octet* key, const octet* userkey )
     aes_128_schedule((uint*) key, userkey);
 }
 
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
 inline void KEY_192_ASSIST(__m128i* temp1, __m128i * temp2, __m128i * temp3)
 { __m128i temp4;
   *temp2 = _mm_shuffle_epi32 (*temp2, 0x55);
diff --git a/Tools/aes.h b/Tools/aes.h
index 7ef6c3d2..07ab13d3 100644
--- a/Tools/aes.h
+++ b/Tools/aes.h
@@ -1,10 +1,9 @@
 #ifndef __AES_H
 #define __AES_H
 
-#include <wmmintrin.h>
-
 #include "Networking/data.h"
 #include "cpu_support.h"
+#include "intrinsics.h"
 
 typedef unsigned int  uint;
 
@@ -54,7 +53,7 @@ __attribute__((optimize("unroll-loops")))
 #endif
 inline __m128i aes_128_encrypt(__m128i in, const octet* key)
 {
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
     if (cpu_has_aes())
     {
         __m128i& tmp = in;
@@ -87,7 +86,7 @@ __attribute__((optimize("unroll-loops")))
 #endif
 inline void ecb_aes_128_encrypt(__m128i* out, const __m128i* in, const octet* key)
 {
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
     if (cpu_has_aes())
     {
         __m128i tmp[N];
diff --git a/Tools/avx_memcpy.h b/Tools/avx_memcpy.h
index ad3a1207..231dc99c 100644
--- a/Tools/avx_memcpy.h
+++ b/Tools/avx_memcpy.h
@@ -6,10 +6,11 @@
 #ifndef TOOLS_AVX_MEMCPY_H_
 #define TOOLS_AVX_MEMCPY_H_
 
-#include <immintrin.h>
 #include <string.h>
 #include <cstdint>
 
+#include "intrinsics.h"
+
 inline void avx_memcpy(void* dest, const void* source, size_t length)
 {
 	memcpy(dest, source, length);
diff --git a/Tools/cpu_support.h b/Tools/cpu_support.h
index 405bb1f2..755c302d 100644
--- a/Tools/cpu_support.h
+++ b/Tools/cpu_support.h
@@ -41,13 +41,14 @@ inline bool cpu_has_avx2()
 #endif
 }
 
-inline bool cpu_has_avx()
+inline bool cpu_has_avx(bool force = false)
 {
-#ifdef CHECK_AVX
-    return check_cpu(1, true, 28);
-#else
-    return true;
+    (void) force;
+#ifndef CHECK_AVX
+    if (force)
 #endif
+        return check_cpu(1, true, 28);
+    return true;
 }
 
 inline bool cpu_has_pclmul()
diff --git a/Tools/intrinsics.h b/Tools/intrinsics.h
new file mode 100644
index 00000000..45664a72
--- /dev/null
+++ b/Tools/intrinsics.h
@@ -0,0 +1,22 @@
+/*
+ * intrinsics.h
+ *
+ */
+
+#ifndef TOOLS_INTRINSICS_H_
+#define TOOLS_INTRINSICS_H_
+
+#ifdef __x86_64__
+#include <immintrin.h>
+#include <x86intrin.h>
+#else
+#define SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES
+#define SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES
+#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES
+#define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES
+#include "simde/simde/x86/avx2.h"
+#include "simde/simde/x86/clmul.h"
+#include "aes-arm.h"
+#endif
+
+#endif /* TOOLS_INTRINSICS_H_ */
diff --git a/Tools/random.cpp b/Tools/random.cpp
index ff24c94f..bcd3ba4e 100644
--- a/Tools/random.cpp
+++ b/Tools/random.cpp
@@ -15,7 +15,7 @@ using namespace std;
 PRNG::PRNG() :
     cnt(0), n_cached_bits(0), cached_bits(0)
 {
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
   #ifdef USE_AES
     useC=(Check_CPU_support_AES()==0);
   #endif
diff --git a/Tools/random.h b/Tools/random.h
index 67314db8..387a0708 100644
--- a/Tools/random.h
+++ b/Tools/random.h
@@ -16,7 +16,7 @@
   #define SEED_SIZE   randombytes_SEEDBYTES
   #define RAND_SIZE   480
 #else
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
   #define PIPELINES   8
 #else
   #define PIPELINES   1
@@ -44,7 +44,7 @@ class PRNG
    octet random[RAND_SIZE] __attribute__((aligned (16)));
 
    #ifdef USE_AES
-#ifdef __AES__
+#if defined(__AES__) || !defined(__x86_64__)
      bool useC;
 #else
      const static bool useC = true;
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 5854058b..e617937a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -11,7 +11,7 @@ pool:
 
 steps:
   - script: |
-      bash -c "sudo apt-get update && sudo apt-get install libsodium-dev libntl-dev yasm texinfo libboost-dev libboost-thread-dev python3-gmpy2 libcrypto++-dev python-networkx python3-networkx python3-sphinx"
+      bash -c "sudo apt-get update && sudo apt-get install libsodium-dev libntl-dev yasm texinfo libboost-dev libboost-thread-dev python3-gmpy2 python3-networkx python3-sphinx"
   - script: |
       make mpir
   - script:
diff --git a/doc/Compiler.rst b/doc/Compiler.rst
index 33b19404..ac57a07f 100644
--- a/doc/Compiler.rst
+++ b/doc/Compiler.rst
@@ -20,11 +20,8 @@ Compiler.types module
 		     ClientMessageType, __weakref__, __repr__,
 		     reg_type, int_type, clear_type, float_type, basic_type,
 		     default_type, unreduced_type, bit_type, dynamic_array,
-		     squant, mov, load_mem, store_in_mem, receive_from_client,
-		     read_from_socket, write_to_socket, init_secure_socket,
-		     read_client_public_key, resp_secure_socket,
-		     resp_secure_socket, write_share_to_socket,
-		     write_shares_to_socket
+		     squant, mov, load_mem, store_in_mem,
+		     write_share_to_socket,
 
 Compiler.GC.types module
 ------------------------
diff --git a/doc/index.rst b/doc/index.rst
index d8440163..d56258bf 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -150,6 +150,7 @@ Reference
    Compiler
    instructions
    low-level
+   networking
 
 
 Indices and tables
diff --git a/doc/networking.rst b/doc/networking.rst
new file mode 100644
index 00000000..10140b7f
--- /dev/null
+++ b/doc/networking.rst
@@ -0,0 +1,30 @@
+Networking
+----------
+
+All protocols in MP-SPDZ rely on point-to-point connections between
+all pairs of parties. This is realized using TCP, which means that
+every party must be reachable under at least one TCP port. The default
+is to set this port to a base plus the player number. This makes it
+easy to run all parties on the same host. The base defaults to 5000,
+which can be changed with the command-line option
+``--portnumbase``. There are two ways of communicating hosts and
+individually setting ports:
+
+1. All parties first connect to a coordination server, which
+   broadcasts the data for all parties. This is the default with the
+   coordination server being run as a thread of party 0. The hostname
+   of the coordination server has to be given with the command-line
+   parameter ``--hostname``, and the coordination server runs on the
+   base port number minus one, thus defaulting to 4999. Furthermore, you
+   can specify a party's listening port using ``--my-port``.
+
+2. The parties read the information from a local file, which needs to
+   be the same everywhere. The file can be specified using
+   ``--ip-file-name`` and has the following format::
+
+     <host0>[:<port0>]
+     <host1>[:<port1>]
+     ...
+
+   The hosts can be both hostnames and IP addresses. If not given, the
+   ports default to base plus party number.
diff --git a/simde b/simde
new file mode 160000
index 00000000..0ba20859
--- /dev/null
+++ b/simde
@@ -0,0 +1 @@
+Subproject commit 0ba20859e9f557cee161d2cab2728b56dca7ec38