#include "Math/gf2n.h"

#include "Exceptions/Exceptions.h"

#include <stdint.h>
#include <wmmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>

// Definitions of the static members of gf2n_short.  Everything except
// rewind is assigned by gf2n_short::init_field().
int gf2n_short::n;        // field degree: elements live in GF(2^n)
int gf2n_short::t1;       // exponents of the middle terms of the reduction
int gf2n_short::t2;       //   polynomial x^n + x^t1 (+ x^t2 + x^t3) + 1;
int gf2n_short::t3;       //   t2 and t3 are only set when nterms==3
int gf2n_short::l0;       // l0 = 64-n and lk = 64+tk-n: the left shifts the
int gf2n_short::l1;       //   reduce_* routines apply to the high product
int gf2n_short::l2;       //   word; l2 and l3 are only set when nterms==3
int gf2n_short::l3;
int gf2n_short::nterms;   // 1 (only t1 in use) or 3 (t1, t2 and t3 in use)
word gf2n_short::mask;    // (1<<n)-1, i.e. the low n bits
bool gf2n_short::useC;    // true: table-based multiply; false: PCLMUL path
bool gf2n_short::rewind = false;   // not used by the code visible in this file

// Filled by gf2n_short::init_tables(): entry [i][j] is the carry-less
// (GF(2)[x]) product of the 8-bit polynomials i and j.
word gf2n_short_table[256][256];

// Number of rows in fields_2
#define num_2_fields 4

/* Supported fields, one row {n,t1,t2,t3} each, as read by
 * gf2n_short::init_field():
 *   n         degree of the field
 *   t1        exponent of the first middle term
 *   t2, t3    further middle-term exponents; t2==0 means they are unused
 *
 * Require
 *  2*(n-1)-64+t1<64
 * (init_field throws invalid_params otherwise).
 */
int fields_2[num_2_fields][4] = { 
	{4,1,0,0},{8,4,3,1},{28,1,0,0},{40,20,15,10}
    };


void gf2n_short::init_tables()
{
  if (sizeof(word)!=8)
    { cout << "Word size is wrong" << endl; 
      throw not_implemented();
    }
  int i,j;
  for (i=0; i<256; i++)
    { for (j=0; j<256; j++)
        { word ii=i,jj=j;
          gf2n_short_table[i][j]=0;
          while (ii!=0)
            { if ((ii&1)==1) { gf2n_short_table[i][j]^=jj; }
              jj<<=1;
              ii>>=1;
            }
         }
    }
}


void gf2n_short::init_field(int nn)
{
  if (nn == 0)
    {
      nn = default_length();
      cerr << "Using GF(2^" << nn << ")" << endl;
    }

  gf2n_short::init_tables();
  int i,j=-1;
  for (i=0; i<num_2_fields && j==-1; i++)
    { if (nn==fields_2[i][0]) { j=i; } }
  if (j==-1)
    {
      if (nn == 128)
	throw runtime_error("need to compile with USE_GF2N_LONG = 1; "
			    "remember to make clean");
      else
	throw runtime_error("field size not supported");
    }

  n=nn;
  nterms=1;
  l0=64-n;  
  t1=fields_2[j][1];
  l1=64+t1-n; 
  if (fields_2[j][2]!=0)
    { nterms=3;
      t2=fields_2[j][2];
      l2=64+t2-n; 
      t3=fields_2[j][3];
      l3=64+t3-n; 
    }
  if (2*(n-1)-64+t1>=64) { throw invalid_params(); }

  mask=(1ULL<<n)-1;

#ifdef __PCLMUL__
  useC=(Check_CPU_support_AES()==0);
#else
  useC = true;
#endif
}
  


/* Carry-less product of the 16-bit polynomials x and y, returned as
   two 16-bit halves:
      product = (c1<<16)^c0
   x and y must fit in 16 bits, as their bytes index gf2n_short_table.
*/
inline void mul16(word x,word y,word& c0,word& c1)
{
  const word xlo=x&(0xFF), xhi=x>>8;
  const word ylo=y&(0xFF), yhi=y>>8;

  // The two cross products sit 8 bits up, so they straddle both halves
  const word cross=gf2n_short_table[xlo][yhi]^gf2n_short_table[xhi][ylo];

  c0=gf2n_short_table[xlo][ylo];
  c1=gf2n_short_table[xhi][yhi];
  c0^=(cross&0xFF)<<8;
  c1^=cross>>8;
}

/* Carry-less product of the 16-bit polynomials x and y, returned as a
   single word.  x and y must fit in 16 bits, as their bytes index
   gf2n_short_table.
*/
inline word mul16(word x,word y)
{
  const word xlo=x&(0xFF), xhi=x>>8;
  const word ylo=y&(0xFF), yhi=y>>8;

  const word cross=gf2n_short_table[xlo][yhi]^gf2n_short_table[xhi][ylo];

  // high*high at bit 16, cross terms at bit 8, low*low at bit 0
  return (gf2n_short_table[xhi][yhi]<<16)
        ^(cross<<8)
        ^gf2n_short_table[xlo][ylo];
}



/* Takes a 16 bit x and returns its 32 bit carry-less square.
   Squaring over GF(2) has no cross terms, so each byte is squared on
   its own using the diagonal of gf2n_short_table.
*/
inline word sqr16(word x)
{
  const word lo=x&(0xFF);
  const word hi=x>>8;

  return (gf2n_short_table[hi][hi]<<16)^gf2n_short_table[lo][lo];
}



/* Reduces the two-word value with high word xh and low word xl modulo
 * x^n + x^t1 + 1 and stores the result in a.
 *
 * Relies on xh being small enough that xh<<l0 and xh<<l1 lose no bits.
 */
void gf2n_short::reduce_trinomial(word xh,word xl)
{
  // Fold the high word down: x^64 = x^l0 + x^l1 modulo the polynomial
  a=xl^(xh<<l0)^(xh<<l1);

  // Keep folding whatever still sticks out above bit n-1
  for (word excess=a>>n; excess!=0; excess=a>>n)
    { a=(a&mask)^excess^(excess<<t1); }
}

/* Reduces the two-word value with high word xh and low word xl modulo
 * x^n + x^t1 + x^t2 + x^t3 + 1 and stores the result in a.
 *
 * Relies on xh being small enough that none of the shifts of xh lose
 * bits.
 */
void gf2n_short::reduce_pentanomial(word xh,word xl)
{
  // Fold the high word down:
  //   x^64 = x^l0 + x^l1 + x^l2 + x^l3 modulo the polynomial
  a=xl^(xh<<l0)^(xh<<l1)^(xh<<l2)^(xh<<l3);

  // Keep folding whatever still sticks out above bit n-1
  for (word excess=a>>n; excess!=0; excess=a>>n)
    { a=(a&mask)^excess^(excess<<t1)^(excess<<t2)^(excess<<t3); }
}


/* Carry-less product of the 32-bit polynomials x and y, written to ans.
 * Built from four 16x16 products; x and y must fit in 32 bits.
 */
void mul32(word x,word y,word& ans)
{
  const word xlo=x&(0xFFFF), xhi=x>>16;
  const word ylo=y&(0xFFFF), yhi=y>>16;

  word low=mul16(xlo,ylo);     // contributes at bit 0
  word high=mul16(xhi,yhi);    // contributes at bit 32

  // Each cross product sits 16 bits up: its low half goes into bits
  // 16..31 of low, its high half into the bottom of high.
  word part0,part1;

  mul16(xlo,yhi,part0,part1);
  low^=part0<<16;
  high^=part1;

  mul16(xhi,ylo,part0,part1);
  low^=part0<<16;
  high^=part1;

  ans=low^(high<<32);
}



void gf2n_short::mul(const gf2n_short& x,const gf2n_short& y)
{
  word hi,lo;
  
  if (gf2n_short::useC)
    { /* Uses Karatsuba */
      word c,d,e,t;
      word xl=x.a&0xFFFFFFFF,yl=y.a&0xFFFFFFFF;
      word xh=x.a>>32,yh=y.a>>32;
      mul32(xl,yl,c);
      mul32(xh,yh,d);
      mul32((xl^xh),(yl^yh),e);
      t=c^e^d;
      lo=c^(t<<32);
      hi=d^(t>>32);
    }
  else
    { /* Use Intel Instructions */
#ifdef __PCLMUL__
      __m128i xx,yy,zz;
      uint64_t c[] __attribute__((aligned (16))) = { 0,0 };
      xx=_mm_set1_epi64x(x.a);
      yy=_mm_set1_epi64x(y.a);
      zz=_mm_clmulepi64_si128(xx,yy,0);
      _mm_store_si128((__m128i*)c,zz);
      lo=c[0];
      hi=c[1];
#else
      throw runtime_error("need to compile with PCLMUL support");
#endif
    }

  reduce(hi,lo);
}




/* Writes the 64 bit carry-less square of the 32 bit x to ans; the two
 * 16-bit halves of x are squared independently with sqr16.
 */
inline void sqr32(word x,word& ans)
{
  const word lo_sq=sqr16(x&(0xFFFF));
  const word hi_sq=sqr16(x>>16);
  ans=lo_sq^(hi_sq<<32);
}

void gf2n_short::square()
{
  word xh,xl;
  sqr32(a&0xFFFFFFFF,xl);
  sqr32(a>>32,xh);
  reduce(xh,xl);
}


void gf2n_short::square(const gf2n_short& bb)
{
  word xh,xl;
  sqr32(bb.a&0xFFFFFFFF,xl);
  sqr32(bb.a>>32,xh);
  reduce(xh,xl);
}






void gf2n_short::invert()
{
  if (is_one())  { return; }
  if (is_zero()) { throw division_by_zero(); }

  word u,v=a,B=0,D=1,mod=1;

  mod^=(1ULL<<n);
  mod^=(1ULL<<t1);
  if (nterms==3)
    { mod^=(1ULL<<t2);
      mod^=(1ULL<<t3);
    }
  u=mod; v=a;
  
  while (u!=0)
    { while ((u&1)==0)
	    { u>>=1;
          if ((B&1)!=0) { B^=mod; }
 	      B>>=1;
        }
      while ((v&1)==0 && v!=0)
	    { v>>=1;
	      if ((D&1)!=0) { D^=mod; }
		  D>>=1;
	    }

      if (u>=v) { u=u^v; B=B^D; }
	  else      { v=v^u; D=D^B; }
   }

  a=D;
}


void gf2n_short::power(long i)
{ 
  long n=i;
  if (n<0) { invert(); n=-n; }

  gf2n_short T=*this;
  assign_one();
  while (n!=0)
    { if ((n&1)!=0) { mul(*this,T); }
       n>>=1;
       T.square();
    }
}


/* Sets *this from two successive G.get_uint() draws: the first is
 * shifted up by 32 bits, the second is xored in, and the result is
 * masked to the low n bits.
 */
void gf2n_short::randomize(PRNG& G)
{
  word drawn=G.get_uint();
  drawn=(drawn<<32)^G.get_uint();
  a=drawn&mask;
}


/* Writes the element to s.
 *   human==true : hexadecimal with base prefix, followed by a space
 *   human==false: the raw sizeof(word) bytes of a
 */
void gf2n_short::output(ostream& s,bool human) const
{
  if (!human)
    { s.write((char*) &a,sizeof(word));
      return;
    }

  s << hex << showbase << a << dec << " ";
}

/* Reads the element from s and masks it to the low n bits.
 *   human==true : parsed as a hexadecimal number
 *   human==false: read as sizeof(word) raw bytes
 *
 * If s.peek() reports EOF: prints a message and throws file_error when
 * s.tellg() is 0, otherwise throws end_of_file.
 */
void gf2n_short::input(istream& s,bool human)
{
  if (s.peek() == EOF)
    { const bool at_start=(s.tellg() == 0);
      if (!at_start)
        { throw end_of_file("gf2n_short"); }

      cout << "IO problem. Empty file?" << endl;
      throw file_error();
    }

  if (!human)
    { s.read((char*) &a,sizeof(word)); }
  else
    { s >> hex >> a >> dec; }

  a &= mask;
}

// Embeds the byte b (an element of GF(256)) into GF(2^40) and stores the
// result in a.  Expansion is by x=y^5+1: bit k of b contributes
// (y^5+1)^k.
void expand_byte(gf2n_short& a,int b)
{
  gf2n_short gen,gen_pow;
  gen.assign(32+1);        // y^5+1
  gen_pow.assign_one();
  a.assign_zero();

  for (; b!=0; b>>=1)
    { if ((b&1)!=0)
        { a.add(a,gen_pow); }
      gen_pow.mul(gen);
    }
}


// Have previously worked out the linear equations we need to solve
void collapse_byte(int& b,const gf2n_short& aa)
{
  word w=aa.get();
  int e35=(w>>35)&1;
  int e30=(w>>30)&1;
  int e25=(w>>25)&1;
  int e20=(w>>20)&1;
  int e15=(w>>15)&1;
  int e10=(w>>10)&1;
  int  e5=(w>>5)&1;
  int  e0=w&1;
  int a[8];
  a[7]=e35;
  a[6]=e30^a[7];
  a[5]=e25^a[7];
  a[4]=e20^a[5]^a[6]^a[7];
  a[3]=e15^a[7];
  a[2]=e10^a[3]^a[6]^a[7];
  a[1]=e5^a[3]^a[5]^a[7];
  a[0]=e0^a[1]^a[2]^a[3]^a[4]^a[5]^a[6]^a[7];

  b=0;
  for (int i=7; i>=0; i--)
    { b=b<<1;
      b+=a[i];
    }
}