You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@milagro.apache.org by br...@apache.org on 2018/11/08 00:12:58 UTC

[38/51] [partial] incubator-milagro-crypto-c git commit: update code

http://git-wip-us.apache.org/repos/asf/incubator-milagro-crypto-c/blob/8d28d2c3/src/ff.c.in
----------------------------------------------------------------------
diff --git a/src/ff.c.in b/src/ff.c.in
new file mode 100644
index 0000000..f4569e7
--- /dev/null
+++ b/src/ff.c.in
@@ -0,0 +1,1135 @@
+/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+*/
+
+/* AMCL basic functions for Large Finite Field support */
+
+#include "ff_WWW.h"
+
/* Arazi and Qi inversion: compute 1/a mod 256 for odd a.
   Starts from an inverse mod 4 and doubles the precision twice
   (to mod 16, then mod 256). Note the return value may exceed 255;
   only its bottom 8 bits are significant. */
static int invmod256(int a)
{
    int U,hi,lo,err;

    /* inverse mod 4: maps 1 -> 5 (== 1 mod 4), 3 -> 3 */
    U=((2-((a>>1)&1))<<1)+1;

    /* lift to an inverse mod 16 */
    lo=(U*(a&3))>>2;       /* carry out of the bottom 2 bits of U*a */
    hi=(U*((a>>2)&3))&3;   /* contribution of the next 2 bits of a */
    err=((lo+hi)*U)&3;     /* residual error in bits 2..3 */
    U+=(4-err)<<2;         /* patch the error */

    /* lift to an inverse mod 256 */
    lo=(U*(a&15))>>4;
    hi=(U*((a>>4)&15))&15;
    err=((lo+hi)*U)&15;
    U+=(16-err)<<4;

    return U;
}
+
/* a=1/a mod 2^BIGBITS, for odd a. This is very fast!
   Seeds with an 8-bit inverse (Arazi-Qi), then doubles the number of
   correct low-order bits on each pass (Hensel lifting), so only
   log2(BIGBITS/8) iterations are needed. */
void BIG_XXX_invmod2m(BIG_XXX a)
{
    int i;
    BIG_XXX U,t1,b,c;
    BIG_XXX_zero(U);
    /* U = 1/a mod 2^8 */
    BIG_XXX_inc(U,invmod256(BIG_XXX_lastbits(a,8)));
    for (i=8; i<BIGBITS_XXX; i<<=1)
    {
        /* given U = 1/a mod 2^i, extend to U = 1/a mod 2^(2i) */
        BIG_XXX_norm(U);
        BIG_XXX_copy(b,a);
        BIG_XXX_mod2m(b,i);   // bottom i bits of a

        BIG_XXX_smul(t1,U,b);
        BIG_XXX_shr(t1,i); // top i bits of U*b

        BIG_XXX_copy(c,a);
        BIG_XXX_shr(c,i);
        BIG_XXX_mod2m(c,i); // top i bits of a

        BIG_XXX_smul(b,U,c);
        BIG_XXX_mod2m(b,i);  // bottom i bits of U*c

        BIG_XXX_add(t1,t1,b);
        BIG_XXX_norm(t1);
        BIG_XXX_smul(b,t1,U);
        BIG_XXX_copy(t1,b);  // (t1+b)*U
        BIG_XXX_mod2m(t1,i);                // bottom i bits of (t1+b)*U

        /* correction term for bits i..2i-1 of the inverse */
        BIG_XXX_one(b);
        BIG_XXX_shl(b,i);
        BIG_XXX_sub(t1,b,t1);
        BIG_XXX_norm(t1);

        BIG_XXX_shl(t1,i);

        BIG_XXX_add(U,U,t1);
    }
    BIG_XXX_copy(a,U);
    BIG_XXX_norm(a);
    BIG_XXX_mod2m(a,BIGBITS_XXX);
}
+
+/* x=y */
+void FF_WWW_copy(BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_copy(x[i],y[i]);
+}
+
+/* x=y<<n */
+static void FF_WWW_dsucopy(BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_copy(x[n+i],y[i]);
+        BIG_XXX_zero(x[i]);
+    }
+}
+
+/* x=y */
+static void FF_WWW_dscopy(BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_copy(x[i],y[i]);
+        BIG_XXX_zero(x[n+i]);
+    }
+}
+
+/* x=y>>n */
+static void FF_WWW_sducopy(BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_copy(x[i],y[n+i]);
+}
+
+/* set to zero */
+void FF_WWW_zero(BIG_XXX x[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_zero(x[i]);
+}
+
+/* test equals 0 */
+int FF_WWW_iszilch(BIG_XXX x[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        if (!BIG_XXX_iszilch(x[i])) return 0;
+    return 1;
+}
+
+/* shift right by BIGBITS-bit words */
+static void FF_WWW_shrw(BIG_XXX a[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_copy(a[i],a[i+n]);
+        BIG_XXX_zero(a[i+n]);
+    }
+}
+
+/* shift left by BIGBITS-bit words */
+static void FF_WWW_shlw(BIG_XXX a[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_copy(a[i+n],a[i]);
+        BIG_XXX_zero(a[i]);
+    }
+}
+
/* extract last bit, i.e. return x mod 2 */
int FF_WWW_parity(BIG_XXX x[])
{
    return BIG_XXX_parity(x[0]);
}
+
/* extract last m bits, i.e. return x mod 2^m as an int
   (m must fit within the bottom BIG) */
int FF_WWW_lastbits(BIG_XXX x[],int m)
{
    return BIG_XXX_lastbits(x[0],m);
}
+
+/* x=1 */
+void FF_WWW_one(BIG_XXX x[],int n)
+{
+    int i;
+    BIG_XXX_one(x[0]);
+    for (i=1; i<n; i++)
+        BIG_XXX_zero(x[i]);
+}
+
/* x=m, where m is 32-bit int */
void FF_WWW_init(BIG_XXX x[],sign32 m,int n)
{
    int i;
    BIG_XXX_zero(x[0]);
#if CHUNK<64
    /* with small chunks a 32-bit value may straddle the first two chunks */
    x[0][0]=(chunk)(m&BMASK_XXX);
    x[0][1]=(chunk)(m>>BASEBITS_XXX);
#else
    x[0][0]=(chunk)m;
#endif
    for (i=1; i<n; i++)
        BIG_XXX_zero(x[i]);
}
+
+/* compare x and y - must be normalised */
+int FF_WWW_comp(BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i,j;
+    for (i=n-1; i>=0; i--)
+    {
+        j=BIG_XXX_comp(x[i],y[i]);
+        if (j!=0) return j;
+    }
+    return 0;
+}
+
+/* recursive add */
+static void FF_WWW_radd(BIG_XXX z[],int zp,BIG_XXX x[],int xp,BIG_XXX y[],int yp,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_add(z[zp+i],x[xp+i],y[yp+i]);
+}
+
+/* recursive inc */
+static void FF_WWW_rinc(BIG_XXX z[],int zp,BIG_XXX y[],int yp,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_add(z[zp+i],z[zp+i],y[yp+i]);
+}
+
+/* recursive dec */
+static void FF_WWW_rdec(BIG_XXX z[],int zp,BIG_XXX y[],int yp,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_sub(z[zp+i],z[zp+i],y[yp+i]);
+}
+
+/* simple add */
+void FF_WWW_add(BIG_XXX z[],BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_add(z[i],x[i],y[i]);
+}
+
+/* simple sub */
+void FF_WWW_sub(BIG_XXX z[],BIG_XXX x[],BIG_XXX y[],int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_sub(z[i],x[i],y[i]);
+}
+
/* increment/decrement by a small integer */
/* x += m; normalisation propagates any carry out of the bottom BIG */
void FF_WWW_inc(BIG_XXX x[],int m,int n)
{
    BIG_XXX_inc(x[0],m);
    FF_WWW_norm(x,n);
}
+
/* x -= m; normalisation propagates any borrow out of the bottom BIG */
void FF_WWW_dec(BIG_XXX x[],int m,int n)
{
    BIG_XXX_dec(x[0],m);
    FF_WWW_norm(x,n);
}
+
/* normalise - but hold any overflow in top part unless n<0
   Propagates per-BIG carries upwards through the n BIGs starting at
   offset zp. A negative n requests truncation: the final carry out of
   the top BIG is discarded instead of being retained. */
static void FF_WWW_rnorm(BIG_XXX z[],int zp,int n)
{
    int i,trunc=0;
    chunk carry;
    if (n<0)
    {
        /* -v n signals to do truncation */
        n=-n;
        trunc=1;
    }
    for (i=0; i<n-1; i++)
    {
        carry=BIG_XXX_norm(z[zp+i]);  /* carry = bits that overflowed this BIG */

        z[zp+i][NLEN_XXX-1]^=carry<<P_TBITS_WWW; /* remove it */
        z[zp+i+1][0]+=carry;  /* and push it into the next BIG up */
    }
    carry=BIG_XXX_norm(z[zp+n-1]);
    if (trunc) z[zp+n-1][NLEN_XXX-1]^=carry<<P_TBITS_WWW;
}
+
/* full normalisation of an n-BIG value: propagate carries between BIGs */
void FF_WWW_norm(BIG_XXX z[],int n)
{
    FF_WWW_rnorm(z,0,n);
}
+
/* shift left by one bit, propagating the carried-out bit between BIGs */
void FF_WWW_shl(BIG_XXX x[],int n)
{
    int i;
    int carry,delay_carry=0;
    for (i=0; i<n-1; i++)
    {
        carry=BIG_XXX_fshl(x[i],1);   /* bit shifted out of this BIG */
        x[i][0]|=delay_carry;         /* bit shifted out of the previous BIG */
        x[i][NLEN_XXX-1]^=(chunk)carry<<P_TBITS_WWW;  /* clear the carried-out bit */
        delay_carry=carry;
    }
    BIG_XXX_fshl(x[n-1],1);   /* top BIG: any carry out is simply dropped */
    x[n-1][0]|=delay_carry;
}
+
/* shift right by one bit, moving the dropped bit of each BIG into the
   top of the BIG below it */
void FF_WWW_shr(BIG_XXX x[],int n)
{
    int i;
    int carry;
    for (i=n-1; i>0; i--)
    {
        carry=BIG_XXX_fshr(x[i],1);   /* bit shifted out of the bottom of this BIG */
        x[i-1][NLEN_XXX-1]|=(chunk)carry<<P_TBITS_WWW;
    }
    BIG_XXX_fshr(x[0],1);   /* bottom bit is discarded */
}
+
+void FF_WWW_output(BIG_XXX x[],int n)
+{
+    int i;
+    FF_WWW_norm(x,n);
+    for (i=n-1; i>=0; i--)
+    {
+        BIG_XXX_output(x[i]);
+        printf(" ");
+    }
+}
+
+void FF_WWW_rawoutput(BIG_XXX x[],int n)
+{
+    int i;
+    for (i=n-1; i>=0; i--)
+    {
+        BIG_XXX_rawoutput(x[i]);
+        printf(" ");
+    }
+}
+
+/* Convert FFs to/from octet strings */
+void FF_WWW_toOctet(octet *w,BIG_XXX x[],int n)
+{
+    int i;
+    w->len=n*MODBYTES_XXX;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_toBytes(&(w->val[(n-i-1)*MODBYTES_XXX]),x[i]);
+    }
+}
+
+void FF_WWW_fromOctet(BIG_XXX x[],octet *w,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_fromBytes(x[i],&(w->val[(n-i-1)*MODBYTES_XXX]));
+    }
+}
+
+/* in-place swapping using xor - side channel resistant */
+static void FF_WWW_cswap(BIG_XXX a[],BIG_XXX b[],int d,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_cswap(a[i],b[i],d);
+    return;
+}
+
/* z=x*y, t is workspace.
   Recursive Karatsuba multiplication: zp/xp/yp/tp are BIG offsets into
   the respective arrays, n is the operand size in BIGs (a power of 2);
   the 2n-BIG product is written at z[zp..zp+2n-1]. */
static void FF_WWW_karmul(BIG_XXX z[],int zp,BIG_XXX x[],int xp,BIG_XXX y[],int yp,BIG_XXX t[],int tp,int n)
{
    int nd2;
    if (n==1)
    {
        /* base case: one double-length BIG multiply, split into two BIGs */
        BIG_XXX_norm(x[xp]);
        BIG_XXX_norm(y[yp]);
        BIG_XXX_mul(t[tp],x[xp],y[yp]);
        BIG_XXX_split(z[zp+1],z[zp],t[tp],BIGBITS_XXX);
        return;
    }

    nd2=n/2;
    /* temporarily use z to hold x0+x1 (low half) and y0+y1 (high half) */
    FF_WWW_radd(z,zp,x,xp,x,xp+nd2,nd2);
    FF_WWW_rnorm(z,zp,nd2);  /* needs this if recursion level too deep */

    FF_WWW_radd(z,zp+nd2,y,yp,y,yp+nd2,nd2);
    FF_WWW_rnorm(z,zp+nd2,nd2);
    FF_WWW_karmul(t,tp,z,zp,z,zp+nd2,t,tp+n,nd2);   /* t = (x0+x1)(y0+y1) */
    FF_WWW_karmul(z,zp,x,xp,y,yp,t,tp+n,nd2);       /* low half of z = x0*y0 */
    FF_WWW_karmul(z,zp+n,x,xp+nd2,y,yp+nd2,t,tp+n,nd2);  /* high half = x1*y1 */
    FF_WWW_rdec(t,tp,z,zp,n);       /* t -= x0*y0 */
    FF_WWW_rdec(t,tp,z,zp+n,n);     /* t -= x1*y1: t is now the middle term */
    FF_WWW_rinc(z,zp+nd2,t,tp,n);   /* add middle term at offset nd2 */
    FF_WWW_rnorm(z,zp,2*n);
}
+
/* z=x^2 by recursive Karatsuba squaring; t is workspace.
   Same layout conventions as FF_WWW_karmul; n must be a power of 2. */
static void FF_WWW_karsqr(BIG_XXX z[],int zp,BIG_XXX x[],int xp,BIG_XXX t[],int tp,int n)
{
    int nd2;
    if (n==1)
    {
        /* base case: one double-length BIG square, split into two BIGs */
        BIG_XXX_norm(x[xp]);
        BIG_XXX_sqr(t[tp],x[xp]);
        BIG_XXX_split(z[zp+1],z[zp],t[tp],BIGBITS_XXX);
        return;
    }
    nd2=n/2;
    FF_WWW_karsqr(z,zp,x,xp,t,tp+n,nd2);        /* low half = x0^2 */
    FF_WWW_karsqr(z,zp+n,x,xp+nd2,t,tp+n,nd2);  /* high half = x1^2 */
    FF_WWW_karmul(t,tp,x,xp,x,xp+nd2,t,tp+n,nd2);  /* t = x0*x1 */
    FF_WWW_rinc(z,zp+nd2,t,tp,n);   /* add 2*x0*x1 at the middle ... */
    FF_WWW_rinc(z,zp+nd2,t,tp,n);   /* ... by adding the cross term twice */

    FF_WWW_rnorm(z,zp+nd2,n);  /* was FF_rnorm(z,zp,2*n)  */
}
+
/* Karatsuba multiply keeping only the low n BIGs of the product. */
static void FF_WWW_karmul_lower(BIG_XXX z[],int zp,BIG_XXX x[],int xp,BIG_XXX y[],int yp,BIG_XXX t[],int tp,int n)
{
    /* Calculates Least Significant bottom half of x*y */
    int nd2;
    if (n==1)
    {
        /* only calculate bottom half of product */
        BIG_XXX_norm(x[xp]);
        BIG_XXX_norm(y[yp]);
        BIG_XXX_smul(z[zp],x[xp],y[yp]);
        return;
    }
    nd2=n/2;
    FF_WWW_karmul(z,zp,x,xp,y,yp,t,tp+n,nd2);        /* full x0*y0 fills z */
    FF_WWW_karmul_lower(t,tp,x,xp+nd2,y,yp,t,tp+n,nd2);  /* low(x1*y0) */
    FF_WWW_rinc(z,zp+nd2,t,tp,nd2);
    FF_WWW_karmul_lower(t,tp,x,xp,y,yp+nd2,t,tp+n,nd2);  /* low(x0*y1) */
    FF_WWW_rinc(z,zp+nd2,t,tp,nd2);
    FF_WWW_rnorm(z,zp+nd2,-nd2);  /* truncate it */
}
+
/* Karatsuba multiply recovering the high n BIGs of x*y, given that the
   low half of the product is already present in z[0..n-1].
   Uses z[n..2n-1] as scratch for the operand sums. */
static void FF_WWW_karmul_upper(BIG_XXX z[],BIG_XXX x[],BIG_XXX y[],BIG_XXX t[],int n)
{
    /* Calculates Most Significant upper half of x*y, given lower part */
    int nd2;

    nd2=n/2;
    FF_WWW_radd(z,n,x,0,x,nd2,nd2);       /* scratch: a0+a1 */
    FF_WWW_radd(z,n+nd2,y,0,y,nd2,nd2);   /* scratch: b0+b1 */
    FF_WWW_rnorm(z,n,nd2);
    FF_WWW_rnorm(z,n+nd2,nd2);

    FF_WWW_karmul(t,0,z,n+nd2,z,n,t,n,nd2);  /* t = (a0+a1)(b0+b1) */
    FF_WWW_karmul(z,n,x,nd2,y,nd2,t,n,nd2); /* z[n]= a1*b1 */
    /* z[0-nd2]=l(a0b0) z[nd2-n]= h(a0b0)+l(t)-l(a0b0)-l(a1b1) */
    FF_WWW_rdec(t,0,z,n,n);              /* t=t-a1b1  */
    FF_WWW_rinc(z,nd2,z,0,nd2);   /* z[nd2-n]+=l(a0b0) = h(a0b0)+l(t)-l(a1b1)  */
    FF_WWW_rdec(z,nd2,t,0,nd2);   /* z[nd2-n]=h(a0b0)+l(t)-l(a1b1)-l(t-a1b1)=h(a0b0) */
    FF_WWW_rnorm(z,0,-n);                    /* a0b0 now in z - truncate it */
    FF_WWW_rdec(t,0,z,0,n);         /* (a0+a1)(b0+b1) - a0b0 */
    FF_WWW_rinc(z,nd2,t,0,n);

    FF_WWW_rnorm(z,nd2,n);
}
+
+/* z=x*y */
+void FF_WWW_mul(BIG_XXX z[],BIG_XXX x[],BIG_XXX y[],int n)
+{
+#ifndef C99
+    BIG_XXX t[2*FFLEN_WWW];
+#else
+    BIG_XXX t[2*n];
+#endif
+    // FF_norm(x,n); /* change here */
+    // FF_norm(y,n); /* change here */
+    FF_WWW_karmul(z,0,x,0,y,0,t,0,n);
+}
+
+/* return low part of product */
+static void FF_WWW_lmul(BIG_XXX z[],BIG_XXX x[],BIG_XXX y[],int n)
+{
+#ifndef C99
+    BIG_XXX t[2*FFLEN_WWW];
+#else
+    BIG_XXX t[2*n];
+#endif
+    // FF_norm(x,n); /* change here */
+    // FF_norm(y,n); /* change here */
+    FF_WWW_karmul_lower(z,0,x,0,y,0,t,0,n);
+}
+
/* Set b=b mod c
   Simple shift-and-subtract reduction: c is shifted left until it
   exceeds b, then shifted back down one bit at a time, subtracting
   wherever possible. NOTE(review): c is modified during the loop, but
   the k left-shifts are undone by k right-shifts, so it is restored on
   exit provided no high bits are shifted out of the top BIG - confirm
   callers leave sufficient headroom in c. */
void FF_WWW_mod(BIG_XXX b[],BIG_XXX c[],int n)
{
    int k=0;

    FF_WWW_norm(b,n);
    if (FF_WWW_comp(b,c,n)<0)
        return;
    do
    {
        FF_WWW_shl(c,n);
        k++;
    }
    while (FF_WWW_comp(b,c,n)>=0);

    while (k>0)
    {
        FF_WWW_shr(c,n);
        if (FF_WWW_comp(b,c,n)>=0)
        {
            FF_WWW_sub(b,b,c,n);
            FF_WWW_norm(b,n);
        }
        k--;
    }
}
+
+/* z=x^2 */
+void FF_WWW_sqr(BIG_XXX z[],BIG_XXX x[],int n)
+{
+#ifndef C99
+    BIG_XXX t[2*FFLEN_WWW];
+#else
+    BIG_XXX t[2*n];
+#endif
+    // FF_norm(x,n);  /* change here */
+    FF_WWW_karsqr(z,0,x,0,t,0,n);
+}
+
/* r=T mod modulus, N is modulus, ND is Montgomery Constant
   (ND = 1/N mod R, R=2^(n*BIGBITS), as produced by FF_WWW_invmod2m).
   Montgomery reduction of the 2n-BIG value T: r = T/R mod N.
   The result may exceed N; callers reduce further if needed. */
static void FF_WWW_reduce(BIG_XXX r[],BIG_XXX T[],BIG_XXX N[],BIG_XXX ND[],int n)
{
    /* fast karatsuba Montgomery reduction */
#ifndef C99
    BIG_XXX t[2*FFLEN_WWW];
    BIG_XXX m[FFLEN_WWW];
#else
    BIG_XXX t[2*n];
    BIG_XXX m[n];
#endif
    FF_WWW_sducopy(r,T,n);  /* keep top half of T */
    FF_WWW_karmul_lower(m,0,T,0,ND,0,t,0,n);  /* m=T.(1/N) mod R */

    FF_WWW_karmul_upper(T,N,m,t,n);  /* T=mN */
    FF_WWW_sducopy(m,T,n);           /* m = top half of m*N */

    /* r = top(T) - top(mN) + N (the +N keeps the subtraction positive) */
    FF_WWW_add(r,r,N,n);
    FF_WWW_sub(r,r,m,n);
    FF_WWW_norm(r,n);
}
+
+
/* Set r=a mod b */
/* a is of length - 2*n */
/* r,b is of length - n */
/* Shift-and-subtract reduction of a double-length value: b is first
   aligned to the top half (b*R), then shifted down bit by bit. */
void FF_WWW_dmod(BIG_XXX r[],BIG_XXX a[],BIG_XXX b[],int n)
{
    int k;
#ifndef C99
    BIG_XXX m[2*FFLEN_WWW];
    BIG_XXX x[2*FFLEN_WWW];
#else
    BIG_XXX m[2*n];
    BIG_XXX x[2*n];
#endif
    FF_WWW_copy(x,a,2*n);     /* work on a local copy of a */
    FF_WWW_norm(x,2*n);
    FF_WWW_dsucopy(m,b,n);    /* m = b shifted up by n BIGs */
    k=BIGBITS_XXX*n;          /* number of single-bit down-shifts to perform */

    while (FF_WWW_comp(x,m,2*n)>=0)
    {
        FF_WWW_sub(x,x,m,2*n);
        FF_WWW_norm(x,2*n);
    }

    while (k>0)
    {
        FF_WWW_shr(m,2*n);

        if (FF_WWW_comp(x,m,2*n)>=0)
        {
            FF_WWW_sub(x,x,m,2*n);
            FF_WWW_norm(x,2*n);
        }

        k--;
    }
    FF_WWW_copy(r,x,n);   /* remainder now fits in n BIGs */
    FF_WWW_mod(r,b,n);    /* final tidy-up reduction */
}
+
/* Set r=1/a mod p. Binary method - a<p on entry */
/* Binary extended Euclidean inversion. Assumes gcd(a,p)=1, otherwise
   the result is not an inverse. NOTE(review): the halving steps add p
   to make x1/x2 even, which only works when p is odd - confirm callers
   pass odd moduli only. Not side-channel resistant. */
void FF_WWW_invmodp(BIG_XXX r[],BIG_XXX a[],BIG_XXX p[],int n)
{
#ifndef C99
    BIG_XXX u[FFLEN_WWW],v[FFLEN_WWW],x1[FFLEN_WWW],x2[FFLEN_WWW],t[FFLEN_WWW],one[FFLEN_WWW];
#else
    BIG_XXX u[n],v[n],x1[n],x2[n],t[n],one[n];
#endif
    FF_WWW_copy(u,a,n);
    FF_WWW_copy(v,p,n);
    FF_WWW_one(one,n);
    FF_WWW_copy(x1,one,n);
    FF_WWW_zero(x2,n);

// reduce n in here as well!
    while (FF_WWW_comp(u,one,n)!=0 && FF_WWW_comp(v,one,n)!=0)
    {
        /* strip factors of two from u, keeping x1 = u-multiplier mod p */
        while (FF_WWW_parity(u)==0)
        {
            FF_WWW_shr(u,n);
            if (FF_WWW_parity(x1)!=0)
            {
                /* x1 odd: add p (odd) to make it even before halving */
                FF_WWW_add(x1,p,x1,n);
                FF_WWW_norm(x1,n);
            }
            FF_WWW_shr(x1,n);
        }
        /* strip factors of two from v, keeping x2 in step */
        while (FF_WWW_parity(v)==0)
        {
            FF_WWW_shr(v,n);
            if (FF_WWW_parity(x2)!=0)
            {
                FF_WWW_add(x2,p,x2,n);
                FF_WWW_norm(x2,n);
            }
            FF_WWW_shr(x2,n);
        }
        if (FF_WWW_comp(u,v,n)>=0)
        {
            /* u = u-v; x1 = x1-x2 mod p */
            FF_WWW_sub(u,u,v,n);
            FF_WWW_norm(u,n);
            if (FF_WWW_comp(x1,x2,n)>=0) FF_WWW_sub(x1,x1,x2,n);
            else
            {
                FF_WWW_sub(t,p,x2,n);
                FF_WWW_add(x1,x1,t,n);
            }
            FF_WWW_norm(x1,n);
        }
        else
        {
            /* v = v-u; x2 = x2-x1 mod p */
            FF_WWW_sub(v,v,u,n);
            FF_WWW_norm(v,n);
            if (FF_WWW_comp(x2,x1,n)>=0) FF_WWW_sub(x2,x2,x1,n);
            else
            {
                FF_WWW_sub(t,p,x1,n);
                FF_WWW_add(x2,x2,t,n);
            }
            FF_WWW_norm(x2,n);
        }
    }
    /* whichever of u,v reached 1 carries the inverse */
    if (FF_WWW_comp(u,one,n)==0)
        FF_WWW_copy(r,x1,n);
    else
        FF_WWW_copy(r,x2,n);
}
+
/* convert a into n-residue (Montgomery) form mod m: a = a*R mod m,
   where R = 2^(n*BIGBITS) */
static void FF_WWW_nres(BIG_XXX a[],BIG_XXX m[],int n)
{
#ifndef C99
    BIG_XXX d[2*FFLEN_WWW];
#else
    BIG_XXX d[2*n];
#endif
    if (n==1)
    {
        /* single BIG: shift up by one full BIG and reduce */
        BIG_XXX_dscopy(d[0],a[0]);
        BIG_XXX_dshl(d[0],NLEN_XXX*BASEBITS_XXX);
        BIG_XXX_dmod(a[0],d[0],m[0]);
    }
    else
    {
        FF_WWW_dsucopy(d,a,n);   /* d = a shifted up by n BIGs, i.e. a*R */
        FF_WWW_dmod(a,d,m,n);
    }
}
+
/* convert a back from n-residue (Montgomery) form: a = a/R mod m.
   ND = 1/m mod R, as produced by FF_WWW_invmod2m. */
static void FF_WWW_redc(BIG_XXX a[],BIG_XXX m[],BIG_XXX ND[],int n)
{
#ifndef C99
    BIG_XXX d[2*FFLEN_WWW];
#else
    BIG_XXX d[2*n];
#endif
    if (n==1)
    {
        /* single BIG: direct Montgomery reduction */
        BIG_XXX_dzero(d[0]);
        BIG_XXX_dscopy(d[0],a[0]);
        BIG_XXX_monty(a[0],m[0],((chunk)1<<BASEBITS_XXX)-ND[0][0],d[0]);
    }
    else
    {
        FF_WWW_mod(a,m,n);        /* ensure a < m before widening */
        FF_WWW_dscopy(d,a,n);
        FF_WWW_reduce(a,d,m,ND,n);
        FF_WWW_mod(a,m,n);        /* final reduction into [0,m) */
    }
}
+
+/* U=1/a mod 2^m - Arazi & Qi */
+static void FF_WWW_invmod2m(BIG_XXX U[],BIG_XXX a[],int n)
+{
+    int i;
+#ifndef C99
+    BIG_XXX t1[FFLEN_WWW],b[FFLEN_WWW],c[FFLEN_WWW];
+#else
+    BIG_XXX t1[2*n],b[n],c[n];
+#endif
+
+    FF_WWW_zero(U,n);
+    FF_WWW_zero(b,n);
+    FF_WWW_zero(c,n);
+    FF_WWW_zero(t1,2*n);
+
+    BIG_XXX_copy(U[0],a[0]);
+    BIG_XXX_invmod2m(U[0]);
+    for (i=1; i<n; i<<=1)
+    {
+        FF_WWW_copy(b,a,i);
+        FF_WWW_mul(t1,U,b,i);
+        FF_WWW_shrw(t1,i); // top half to bottom half, top half=0
+
+        FF_WWW_copy(c,a,2*i);
+        FF_WWW_shrw(c,i); // top half of c
+        FF_WWW_lmul(b,U,c,i); // should set top half of b=0
+        FF_WWW_add(t1,t1,b,i);
+        FF_WWW_norm(t1,2*i);
+        FF_WWW_lmul(b,t1,U,i);
+        FF_WWW_copy(t1,b,i);
+        FF_WWW_one(b,i);
+        FF_WWW_shlw(b,i);
+        FF_WWW_sub(t1,b,t1,2*i);
+        FF_WWW_norm(t1,2*i);
+        FF_WWW_shlw(t1,i);
+        FF_WWW_add(U,U,t1,2*i);
+    }
+
+    FF_WWW_norm(U,n);
+}
+
+void FF_WWW_random(BIG_XXX x[],csprng *rng,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+    {
+        BIG_XXX_random(x[i],rng);
+    }
+    /* make sure top bit is 1 */
+    while (BIG_XXX_nbits(x[n-1])<MODBYTES_XXX*8) BIG_XXX_random(x[n-1],rng);
+}
+
+/* generate random x mod p */
+void FF_WWW_randomnum(BIG_XXX x[],BIG_XXX p[],csprng *rng,int n)
+{
+    int i;
+#ifndef C99
+    BIG_XXX d[2*FFLEN_WWW];
+#else
+    BIG_XXX d[2*n];
+#endif
+    for (i=0; i<2*n; i++)
+    {
+        BIG_XXX_random(d[i],rng);
+    }
+    FF_WWW_dmod(x,d,p,n);
+}
+
/* z=x*y mod p, operands and result in Montgomery form.
   ND = 1/p mod R as produced by FF_WWW_invmod2m.
   NOTE: x may be reduced mod p IN PLACE if its accumulated excess is
   too large for the product to stay below pR. */
static void FF_WWW_modmul(BIG_XXX z[],BIG_XXX x[],BIG_XXX y[],BIG_XXX p[],BIG_XXX ND[],int n)
{
#ifndef C99
    BIG_XXX d[2*FFLEN_WWW];
#else
    BIG_XXX d[2*n];
#endif
    chunk ex=P_EXCESS_WWW(x[n-1]);   /* unnormalised excess of each operand */
    chunk ey=P_EXCESS_WWW(y[n-1]);
#ifdef dchunk
    if ((dchunk)(ex+1)*(ey+1)>(dchunk)P_FEXCESS_WWW)
#else
    if ((ex+1)>P_FEXCESS_WWW/(ey+1))   /* divide to avoid overflow without a double-width type */
#endif
    {
#ifdef DEBUG_REDUCE
        printf("Product too large - reducing it %d %d\n",ex,ey);
#endif
        FF_WWW_mod(x,p,n);
    }

    if (n==1)
    {
        /* single BIG: plain multiply then Montgomery reduce */
        BIG_XXX_mul(d[0],x[0],y[0]);
        BIG_XXX_monty(z[0],p[0],((chunk)1<<BASEBITS_XXX)-ND[0][0],d[0]);
    }
    else
    {
        FF_WWW_mul(d,x,y,n);
        FF_WWW_reduce(z,d,p,ND,n);
    }
}
+
/* z=x^2 mod p, operand and result in Montgomery form.
   ND = 1/p mod R as produced by FF_WWW_invmod2m.
   NOTE: x may be reduced mod p IN PLACE if its excess is too large. */
static void FF_WWW_modsqr(BIG_XXX z[],BIG_XXX x[],BIG_XXX p[],BIG_XXX ND[],int n)
{
#ifndef C99
    BIG_XXX d[2*FFLEN_WWW];
#else
    BIG_XXX d[2*n];
#endif
    chunk ex=P_EXCESS_WWW(x[n-1]);   /* unnormalised excess of the operand */
#ifdef dchunk
    if ((dchunk)(ex+1)*(ex+1)>(dchunk)P_FEXCESS_WWW)
#else
    if ((ex+1)>P_FEXCESS_WWW/(ex+1))   /* divide to avoid overflow without a double-width type */
#endif
    {
#ifdef DEBUG_REDUCE
        printf("Product too large - reducing it %d\n",ex);
#endif
        FF_WWW_mod(x,p,n);
    }
    if (n==1)
    {
        /* single BIG: plain square then Montgomery reduce */
        BIG_XXX_sqr(d[0],x[0]);
        BIG_XXX_monty(z[0],p[0],((chunk)1<<BASEBITS_XXX)-ND[0][0],d[0]);
    }
    else
    {
        FF_WWW_sqr(d,x,n);
        FF_WWW_reduce(z,d,p,ND,n);
    }
}
+
/* r=x^e mod p using side-channel resistant Montgomery Ladder, for large e */
/* e is an n-BIG exponent. Work is done in Montgomery form, with a fixed
   mul+sqr sequence per bit; secret-dependent selection happens only
   through the constant-time conditional swaps. */
void FF_WWW_skpow(BIG_XXX r[],BIG_XXX x[],BIG_XXX e[],BIG_XXX p[],int n)
{
    int i,b;
#ifndef C99
    BIG_XXX R0[FFLEN_WWW],R1[FFLEN_WWW],ND[FFLEN_WWW];
#else
    BIG_XXX R0[n],R1[n],ND[n];
#endif
    FF_WWW_invmod2m(ND,p,n);   /* ND = 1/p mod R, for Montgomery reduction */

    FF_WWW_one(R0,n);
    FF_WWW_copy(R1,x,n);
    FF_WWW_nres(R0,p,n);   /* enter Montgomery form */
    FF_WWW_nres(R1,p,n);

    for (i=8*MODBYTES_XXX*n-1; i>=0; i--)
    {
        b=BIG_XXX_bit(e[i/BIGBITS_XXX],i%BIGBITS_XXX);   /* current exponent bit */
        FF_WWW_modmul(r,R0,R1,p,ND,n);   /* r = R0*R1 */

        FF_WWW_cswap(R0,R1,b,n);
        FF_WWW_modsqr(R0,R0,p,ND,n);     /* square the selected register */

        FF_WWW_copy(R1,r,n);
        FF_WWW_cswap(R0,R1,b,n);
    }
    FF_WWW_copy(r,R0,n);
    FF_WWW_redc(r,p,ND,n);   /* leave Montgomery form */
}
+
/* r=x^e mod p using side-channel resistant Montgomery Ladder, for short e */
/* Same ladder as FF_WWW_skpow, but e is a single BIG exponent. */
void FF_WWW_skspow(BIG_XXX r[],BIG_XXX x[],BIG_XXX e,BIG_XXX p[],int n)
{
    int i,b;
#ifndef C99
    BIG_XXX R0[FFLEN_WWW],R1[FFLEN_WWW],ND[FFLEN_WWW];
#else
    BIG_XXX R0[n],R1[n],ND[n];
#endif
    FF_WWW_invmod2m(ND,p,n);   /* ND = 1/p mod R, for Montgomery reduction */
    FF_WWW_one(R0,n);
    FF_WWW_copy(R1,x,n);
    FF_WWW_nres(R0,p,n);       /* enter Montgomery form */
    FF_WWW_nres(R1,p,n);
    for (i=8*MODBYTES_XXX-1; i>=0; i--)
    {
        b=BIG_XXX_bit(e,i);    /* current exponent bit */
        FF_WWW_modmul(r,R0,R1,p,ND,n);
        FF_WWW_cswap(R0,R1,b,n);
        FF_WWW_modsqr(R0,R0,p,ND,n);
        FF_WWW_copy(R1,r,n);
        FF_WWW_cswap(R0,R1,b,n);
    }
    FF_WWW_copy(r,R0,n);
    FF_WWW_redc(r,p,ND,n);     /* leave Montgomery form */
}
+
/* raise to an integer power - right-to-left method */
/* r = x^e mod p for small positive int e, via right-to-left binary
   exponentiation in Montgomery form; e==2 is special-cased to a single
   squaring. NOTE(review): e==0 would break out of the loop with r never
   assigned - callers appear to pass e>=1 (FF_WWW_prime uses e=2); confirm
   no caller passes 0 or a negative e. */
void FF_WWW_power(BIG_XXX r[],BIG_XXX x[],int e,BIG_XXX p[],int n)
{
    int f=1;   /* first-assignment flag: r is copied rather than multiplied */
#ifndef C99
    BIG_XXX w[FFLEN_WWW],ND[FFLEN_WWW];
#else
    BIG_XXX w[n],ND[n];
#endif
    FF_WWW_invmod2m(ND,p,n);   /* ND = 1/p mod R, for Montgomery reduction */

    FF_WWW_copy(w,x,n);
    FF_WWW_nres(w,p,n);        /* enter Montgomery form */

    if (e==2)
    {
        FF_WWW_modsqr(r,w,p,ND,n);
    }
    else for (;;)
        {
            if (e%2==1)
            {
                if (f) FF_WWW_copy(r,w,n);
                else FF_WWW_modmul(r,r,w,p,ND,n);
                f=0;
            }
            e>>=1;
            if (e==0) break;
            FF_WWW_modsqr(w,w,p,ND,n);   /* w = x^(2^k) */
        }

    FF_WWW_redc(r,p,ND,n);     /* leave Montgomery form */
}
+
/* r=x^e mod p, faster but not side channel resistant */
/* Left-to-right square-and-multiply over an n-BIG exponent e,
   in Montgomery form; the multiply is data-dependent, so do NOT use
   with secret exponents. */
void FF_WWW_pow(BIG_XXX r[],BIG_XXX x[],BIG_XXX e[],BIG_XXX p[],int n)
{
    int i,b;
#ifndef C99
    BIG_XXX w[FFLEN_WWW],ND[FFLEN_WWW];
#else
    BIG_XXX w[n],ND[n];
#endif
    FF_WWW_invmod2m(ND,p,n);   /* ND = 1/p mod R, for Montgomery reduction */

    FF_WWW_copy(w,x,n);
    FF_WWW_one(r,n);
    FF_WWW_nres(r,p,n);        /* enter Montgomery form */
    FF_WWW_nres(w,p,n);

    for (i=8*MODBYTES_XXX*n-1; i>=0; i--)
    {
        FF_WWW_modsqr(r,r,p,ND,n);
        b=BIG_XXX_bit(e[i/BIGBITS_XXX],i%BIGBITS_XXX);
        if (b==1) FF_WWW_modmul(r,r,w,p,ND,n);
    }
    FF_WWW_redc(r,p,ND,n);     /* leave Montgomery form */
}
+
/* double exponentiation r=x^e.y^f mod p */
/* Shamir's trick: scan e and f together, multiplying by x, y or x*y
   according to the bit pair. e and f are single BIGs. Not side-channel
   resistant (the multiplication pattern depends on the exponent bits). */
void FF_WWW_pow2(BIG_XXX r[],BIG_XXX x[],BIG_XXX e,BIG_XXX y[],BIG_XXX f,BIG_XXX p[],int n)
{
    int i,eb,fb;
#ifndef C99
    BIG_XXX xn[FFLEN_WWW],yn[FFLEN_WWW],xy[FFLEN_WWW],ND[FFLEN_WWW];
#else
    BIG_XXX xn[n],yn[n],xy[n],ND[n];
#endif

    FF_WWW_invmod2m(ND,p,n);   /* ND = 1/p mod R, for Montgomery reduction */

    FF_WWW_copy(xn,x,n);
    FF_WWW_copy(yn,y,n);
    FF_WWW_nres(xn,p,n);       /* enter Montgomery form */
    FF_WWW_nres(yn,p,n);
    FF_WWW_modmul(xy,xn,yn,p,ND,n);   /* precompute x*y */
    FF_WWW_one(r,n);
    FF_WWW_nres(r,p,n);

    for (i=8*MODBYTES_XXX-1; i>=0; i--)
    {
        eb=BIG_XXX_bit(e,i);
        fb=BIG_XXX_bit(f,i);
        FF_WWW_modsqr(r,r,p,ND,n);
        if (eb==1)
        {
            if (fb==1) FF_WWW_modmul(r,r,xy,p,ND,n);
            else FF_WWW_modmul(r,r,xn,p,ND,n);
        }
        else
        {
            if (fb==1) FF_WWW_modmul(r,r,yn,p,ND,n);
        }
    }
    FF_WWW_redc(r,p,ND,n);     /* leave Montgomery form */
}
+
+static sign32 igcd(sign32 x,sign32 y)
+{
+    /* integer GCD, returns GCD of x and y */
+    sign32 r;
+    if (y==0) return x;
+    while ((r=x%y)!=0)
+        x=y,y=r;
+    return y;
+}
+
/* quick and dirty check for common factor with s */
/* Returns 1 if gcd(w,s)>1, else 0. Reduces a local copy of w by
   repeated subtraction of s and removal of factors of two (harmless
   here since the s used by FF_WWW_prime is odd), until it fits in a
   32-bit value, then takes an integer gcd. */
int FF_WWW_cfactor(BIG_XXX w[],sign32 s,int n)
{
    int r;
    sign32 g;
#ifndef C99
    BIG_XXX x[FFLEN_WWW],y[FFLEN_WWW];
#else
    BIG_XXX x[n],y[n];
#endif
    FF_WWW_init(y,s,n);     /* y = s as an FF */
    FF_WWW_copy(x,w,n);     /* work on a copy; w is untouched */
    FF_WWW_norm(x,n);

    do
    {
        FF_WWW_sub(x,x,y,n);
        FF_WWW_norm(x,n);
        /* strip powers of two (coprime to odd s) */
        while (!FF_WWW_iszilch(x,n) && FF_WWW_parity(x)==0) FF_WWW_shr(x,n);
    }
    while (FF_WWW_comp(x,y,n)>0);
#if CHUNK<32
    g=x[0][0]+((sign32)(x[0][1])<<BASEBITS_XXX);   /* reassemble from two small chunks */
#else
    g=(sign32)x[0][0];
#endif
    r=igcd(s,g);
    if (r>1) return 1;
    return 0;
}
+
/* Miller-Rabin test for primality. Slow. */
/* Returns 1 if p is probably prime (10 random rounds), 0 if composite.
   First rejects anything sharing a factor with 3*5*7*11*13*17*19. */
int FF_WWW_prime(BIG_XXX p[],csprng *rng,int n)
{
    int i,j,loop,s=0;
#ifndef C99
    BIG_XXX d[FFLEN_WWW],x[FFLEN_WWW],unity[FFLEN_WWW],nm1[FFLEN_WWW];
#else
    BIG_XXX d[n],x[n],unity[n],nm1[n];
#endif
    sign32 sf=4849845;/* 3*5*.. *19 */

    FF_WWW_norm(p,n);

    /* cheap trial check against small primes */
    if (FF_WWW_cfactor(p,sf,n)) return 0;

    /* write p-1 = d * 2^s with d odd */
    FF_WWW_one(unity,n);
    FF_WWW_sub(nm1,p,unity,n);
    FF_WWW_norm(nm1,n);
    FF_WWW_copy(d,nm1,n);
    while (FF_WWW_parity(d)==0)
    {
        FF_WWW_shr(d,n);
        s++;
    }
    if (s==0) return 0;   /* p-1 odd means p is even, hence composite */

    /* 10 Miller-Rabin rounds with random bases */
    for (i=0; i<10; i++)
    {
        FF_WWW_randomnum(x,p,rng,n);
        FF_WWW_pow(x,x,d,p,n);   /* x = x^d mod p */
        if (FF_WWW_comp(x,unity,n)==0 || FF_WWW_comp(x,nm1,n)==0) continue;
        loop=0;
        /* square up to s-1 times looking for -1 */
        for (j=1; j<s; j++)
        {
            FF_WWW_power(x,x,2,p,n);
            if (FF_WWW_comp(x,unity,n)==0) return 0;   /* non-trivial sqrt of 1 */
            if (FF_WWW_comp(x,nm1,n)==0 )
            {
                loop=1;
                break;
            }
        }
        if (loop) continue;
        return 0;   /* witness found: composite */
    }

    return 1;
}
+
+/*
+BIG P[4]= {{0x1670957,0x1568CD3C,0x2595E5,0xEED4F38,0x1FC9A971,0x14EF7E62,0xA503883,0x9E1E05E,0xBF59E3},{0x1844C908,0x1B44A798,0x3A0B1E7,0xD1B5B4E,0x1836046F,0x87E94F9,0x1D34C537,0xF7183B0,0x46D07},{0x17813331,0x19E28A90,0x1473A4D6,0x1CACD01F,0x1EEA8838,0xAF2AE29,0x1F85292A,0x1632585E,0xD945E5},{0x919F5EF,0x1567B39F,0x19F6AD11,0x16CE47CF,0x9B36EB1,0x35B7D3,0x483B28C,0xCBEFA27,0xB5FC21}};
+
+int main()
+{
+	int i;
+	BIG p[4],e[4],x[4],r[4];
+	csprng rng;
+	char raw[100];
+	for (i=0;i<100;i++) raw[i]=i;
+    RAND_seed(&rng,100,raw);
+
+
+	FF_init(x,3,4);
+
+	FF_copy(p,P,4);
+	FF_copy(e,p,4);
+	FF_dec(e,1,4);
+	FF_norm(e,4);
+
+
+
+	printf("p= ");FF_output(p,4); printf("\n");
+	if (FF_prime(p,&rng,4)) printf("p is a prime\n");
+	printf("e= ");FF_output(e,4); printf("\n");
+
+	FF_skpow(r,x,e,p,4);
+	printf("r= ");FF_output(r,4); printf("\n");
+}
+
+*/

http://git-wip-us.apache.org/repos/asf/incubator-milagro-crypto-c/blob/8d28d2c3/src/fp.c.in
----------------------------------------------------------------------
diff --git a/src/fp.c.in b/src/fp.c.in
new file mode 100644
index 0000000..9144fcf
--- /dev/null
+++ b/src/fp.c.in
@@ -0,0 +1,650 @@
+/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+*/
+
+/* AMCL mod p functions */
+/* Small Finite Field arithmetic */
+/* SU=m, SU is Stack Usage (NOT_SPECIAL Modulus) */
+
+#include "fp_YYY.h"
+
+/* Fast Modular Reduction Methods */
+
+/* r=d mod m */
+/* d MUST be normalised */
+/* Products must be less than pR in all cases !!! */
+/* So when multiplying two numbers, their product *must* be less than MODBITS+BASEBITS*NLEN */
+/* Results *may* be one bit bigger than MODBITS */
+
+#if MODTYPE_YYY == PSEUDO_MERSENNE
+/* r=d mod m */
+
+/* Converts from BIG integer to residue form mod Modulus */
+/* Pseudo-Mersenne case: no Montgomery conversion is used, the residue is the value itself */
+void FP_YYY_nres(FP_YYY *y,BIG_XXX x)
+{
+    BIG_XXX_copy(y->g,x);
+    y->XES=1; /* excess bound, checked against FEXCESS before mul/sqr */
+}
+
+/* Converts from residue form back to BIG integer form */
+/* Pseudo-Mersenne case: identity copy (no Montgomery form in use) */
+void FP_YYY_redc(BIG_XXX x,FP_YYY *y)
+{
+    BIG_XXX_copy(x,y->g);
+}
+
+/* reduce a DBIG to a BIG exploiting the special form of the modulus */
+/* For p = 2^MODBITS - MConst: split d = t*2^MODBITS + b, so d mod p = MConst*t + b */
+void FP_YYY_mod(BIG_XXX r,DBIG_XXX d)
+{
+    BIG_XXX t,b;
+    chunk v,tw;
+    BIG_XXX_split(t,b,d,MODBITS_YYY);
+
+    /* Note that all of the excess gets pushed into t. So if squaring a value with a 4-bit excess, this results in
+       t getting all 8 bits of the excess product! So products must be less than pR which is Montgomery compatible */
+
+    if (MConst_YYY < NEXCESS_XXX)
+    {
+        /* MConst small: imul fits within the per-chunk excess bound */
+        BIG_XXX_imul(t,t,MConst_YYY);
+        BIG_XXX_norm(t);
+        BIG_XXX_add(r,t,b);
+        BIG_XXX_norm(r);
+        tw=r[NLEN_XXX-1];
+        r[NLEN_XXX-1]&=TMASK_YYY;
+        r[0]+=MConst_YYY*((tw>>TBITS_YYY));
+    }
+    else
+    {
+        /* larger MConst: pmul returns the top carry v which must be folded back in */
+        v=BIG_XXX_pmul(t,t,MConst_YYY);
+        BIG_XXX_add(r,t,b);
+        BIG_XXX_norm(r);
+        tw=r[NLEN_XXX-1];
+        r[NLEN_XXX-1]&=TMASK_YYY;
+#if CHUNK == 16
+        r[1]+=muladd_XXX(MConst_YYY,((tw>>TBITS_YYY)+(v<<(BASEBITS_XXX-TBITS_YYY))),0,&r[0]);
+#else
+        r[0]+=MConst_YYY*((tw>>TBITS_YYY)+(v<<(BASEBITS_XXX-TBITS_YYY)));
+#endif
+    }
+    BIG_XXX_norm(r);
+}
+#endif
+
+/* This only applies to Curve C448, so specialised (for now) */
+#if MODTYPE_YYY == GENERALISED_MERSENNE
+
+/* Convert BIG to residue form - identity for the Generalised-Mersenne modulus */
+void FP_YYY_nres(FP_YYY *y,BIG_XXX x)
+{
+    BIG_XXX_copy(y->g,x);
+    y->XES=1;
+}
+
+/* Converts from residue form back to BIG integer form */
+/* Identity copy for the Generalised-Mersenne modulus */
+void FP_YYY_redc(BIG_XXX x,FP_YYY *y)
+{
+    BIG_XXX_copy(x,y->g);
+}
+
+/* reduce a DBIG to a BIG exploiting the special form of the modulus */
+/* Per the section comment above this targets Curve C448 (Goldilocks-style
+   p = 2^448 - 2^224 - 1), folding the top half back in at bit 0 and bit 224 */
+void FP_YYY_mod(BIG_XXX r,DBIG_XXX d)
+{
+    BIG_XXX t,b;
+    chunk carry;
+    BIG_XXX_split(t,b,d,MBITS_YYY);
+
+    BIG_XXX_add(r,t,b);
+
+    BIG_XXX_dscopy(d,t);
+    BIG_XXX_dshl(d,MBITS_YYY/2);
+
+    BIG_XXX_split(t,b,d,MBITS_YYY);
+
+    BIG_XXX_add(r,r,t);
+    BIG_XXX_add(r,r,b);
+    BIG_XXX_norm(r);
+    BIG_XXX_shl(t,MBITS_YYY/2);
+
+    BIG_XXX_add(r,r,t);
+
+    carry=r[NLEN_XXX-1]>>TBITS_YYY;
+
+    r[NLEN_XXX-1]&=TMASK_YYY;
+    r[0]+=carry;
+
+    r[224/BASEBITS_XXX]+=carry<<(224%BASEBITS_XXX); /* need to check that this falls mid-word */
+    BIG_XXX_norm(r);
+}
+
+#endif
+
+#if MODTYPE_YYY == MONTGOMERY_FRIENDLY
+
+/* convert to Montgomery n-residue form */
+/* Multiply by R^2 mod p then Montgomery-reduce, yielding x*R mod p */
+void FP_YYY_nres(FP_YYY *y,BIG_XXX x)
+{
+    DBIG_XXX d;
+    BIG_XXX r;
+    BIG_XXX_rcopy(r,R2modp_YYY);
+    BIG_XXX_mul(d,x,r);
+    FP_YYY_mod(y->g,d);
+    y->XES=2;
+}
+
+/* convert back to regular form */
+/* Montgomery reduction of the zero-extended residue strips the factor of R */
+void FP_YYY_redc(BIG_XXX x,FP_YYY *y)
+{
+    DBIG_XXX d;
+    BIG_XXX_dzero(d);
+    BIG_XXX_dscopy(d,y->g);
+    FP_YYY_mod(x,d);
+}
+
+/* fast modular reduction from DBIG to BIG exploiting special form of the modulus */
+/* Montgomery-friendly case: one muladd per word clears the low half of d in place */
+void FP_YYY_mod(BIG_XXX a,DBIG_XXX d)
+{
+    int i;
+
+    for (i=0; i<NLEN_XXX; i++)
+        d[NLEN_XXX+i]+=muladd_XXX(d[i],MConst_YYY-1,d[i],&d[NLEN_XXX+i-1]);
+
+    /* result is the (now reduced) upper half of d */
+    BIG_XXX_sducopy(a,d);
+    BIG_XXX_norm(a);
+}
+
+#endif
+
+#if MODTYPE_YYY == NOT_SPECIAL
+
+/* convert to Montgomery n-residue form */
+/* Multiply by R^2 mod p then Montgomery-reduce, yielding x*R mod p */
+void FP_YYY_nres(FP_YYY *y,BIG_XXX x)
+{
+    DBIG_XXX d;
+    BIG_XXX r;
+    BIG_XXX_rcopy(r,R2modp_YYY);
+    BIG_XXX_mul(d,x,r);
+    FP_YYY_mod(y->g,d);
+    y->XES=2;
+}
+
+/* convert back to regular form */
+/* Montgomery reduction of the zero-extended residue strips the factor of R */
+void FP_YYY_redc(BIG_XXX x,FP_YYY *y)
+{
+    DBIG_XXX d;
+    BIG_XXX_dzero(d);
+    BIG_XXX_dscopy(d,y->g);
+    FP_YYY_mod(x,d);
+}
+
+
+/* reduce a DBIG to a BIG using Montgomery's no trial division method */
+/* d is expected to be dnormed before entry */
+/* SU= 112 */
+void FP_YYY_mod(BIG_XXX a,DBIG_XXX d)
+{
+    BIG_XXX mdls;
+    BIG_XXX_rcopy(mdls,Modulus_YYY);
+    BIG_XXX_monty(a,mdls,MConst_YYY,d);  /* generic Montgomery reduction */
+}
+
+#endif
+
+/* test x==0 ? */
+/* SU= 48 */
+/* NOTE: fully reduces x mod Modulus as a side effect */
+int FP_YYY_iszilch(FP_YYY *x)
+{
+    BIG_XXX m;
+    BIG_XXX_rcopy(m,Modulus_YYY);
+    BIG_XXX_mod(x->g,m);
+    return BIG_XXX_iszilch(x->g);
+}
+
+/* Copy x to y, including its excess tracker */
+void FP_YYY_copy(FP_YYY *y,FP_YYY *x)
+{
+    BIG_XXX_copy(y->g,x->g);
+    y->XES=x->XES;
+}
+
+/* Load y from a ROM constant c, converting it into residue form */
+void FP_YYY_rcopy(FP_YYY *y, const BIG_XXX c)
+{
+    BIG_XXX t;
+    BIG_XXX_rcopy(t,c);
+    FP_YYY_nres(y,t);
+}
+
+/* Swap a and b if d=1 */
+/* Constant-time: d is turned into an all-ones/all-zeros mask, no branching */
+void FP_YYY_cswap(FP_YYY *a,FP_YYY *b,int d)
+{
+    sign32 t,c=d;
+    BIG_XXX_cswap(a->g,b->g,d);
+
+    c=~(c-1);   /* c = 0xFFFFFFFF when d==1, 0 when d==0 */
+    t=c&((a->XES)^(b->XES));
+    a->XES^=t;  /* XOR-swap of the excess trackers under the mask */
+    b->XES^=t;
+
+}
+
+/* Move b to a if d=1 */
+/* Constant-time conditional move: no data-dependent branching */
+void FP_YYY_cmove(FP_YYY *a,FP_YYY *b,int d)
+{
+    sign32 c=-d;  /* all-ones mask when d==1, zero when d==0 */
+
+    BIG_XXX_cmove(a->g,b->g,d);
+    a->XES^=(a->XES^b->XES)&c;
+}
+
+/* Set x=0 in residue form */
+void FP_YYY_zero(FP_YYY *x)
+{
+    BIG_XXX_zero(x->g);
+    x->XES=1;
+}
+
+/* Return 1 if x==y as field elements, else 0.
+   Both arguments are fully reduced mod Modulus as a side effect. */
+int FP_YYY_equals(FP_YYY *x,FP_YYY *y)
+{
+    FP_YYY_reduce(x);
+    FP_YYY_reduce(y);
+    return (BIG_XXX_comp(x->g,y->g)==0);
+}
+
+/* output FP */
+/* SU= 48 */
+/* Prints the element in regular (non-residue) form */
+void FP_YYY_output(FP_YYY *r)
+{
+    BIG_XXX c;
+    FP_YYY_redc(c,r);
+    BIG_XXX_output(c);
+}
+
+/* Print the internal residue representation, without conversion */
+void FP_YYY_rawoutput(FP_YYY *r)
+{
+    BIG_XXX_rawoutput(r->g);
+}
+
+#ifdef GET_STATS
+int tsqr=0,rsqr=0,tmul=0,rmul=0;
+int tadd=0,radd=0,tneg=0,rneg=0;
+int tdadd=0,rdadd=0,tdneg=0,rdneg=0;
+#endif
+
+#ifdef FUSED_MODMUL
+
+/* Insert fastest code here */
+
+#endif
+
+/* r=a*b mod Modulus */
+/* product must be less that p.R - and we need to know this in advance! */
+/* SU= 88 */
+void FP_YYY_mul(FP_YYY *r,FP_YYY *a,FP_YYY *b)
+{
+    DBIG_XXX d;
+
+    /* excess trackers predict whether the double-width product could exceed p.R */
+    if ((sign64)a->XES*b->XES>(sign64)FEXCESS_YYY)
+    {
+#ifdef DEBUG_REDUCE
+        printf("Product too large - reducing it\n");
+#endif
+        FP_YYY_reduce(a);  /* it is sufficient to fully reduce just one of them < p */
+    }
+
+#ifdef FUSED_MODMUL
+    FP_YYY_modmul(r->g,a->g,b->g);
+#else
+    BIG_XXX_mul(d,a->g,b->g);
+    FP_YYY_mod(r->g,d);
+#endif
+    r->XES=2;
+}
+
+
+/* multiplication by an integer, r=a*c */
+/* SU= 136 */
+/* Negative c is handled by multiplying by |c| then negating the result */
+void FP_YYY_imul(FP_YYY *r,FP_YYY *a,int c)
+{
+    int s=0;
+
+    if (c<0)
+    {
+        c=-c;
+        s=1;
+    }
+
+#if MODTYPE_YYY==PSEUDO_MERSENNE || MODTYPE_YYY==GENERALISED_MERSENNE
+    DBIG_XXX d;
+    BIG_XXX_pxmul(d,a->g,c);
+    FP_YYY_mod(r->g,d);
+    r->XES=2;
+
+#else
+    //Montgomery
+    BIG_XXX k;
+    FP_YYY f;
+    if (a->XES*c<=FEXCESS_YYY)
+    {
+        BIG_XXX_pmul(r->g,a->g,c);
+        r->XES=a->XES*c;    // careful here - XES jumps!
+    }
+    else
+    {
+        // don't want to do this - only a problem for Montgomery modulus and larger constants
+        /* fall back to a full field multiplication by nres(c) */
+        BIG_XXX_zero(k);
+        BIG_XXX_inc(k,c);
+        BIG_XXX_norm(k);
+        FP_YYY_nres(&f,k);
+        FP_YYY_mul(r,a,&f);
+    }
+#endif
+    if (s)
+    {
+        FP_YYY_neg(r,r);
+        FP_YYY_norm(r);
+    }
+}
+
+/* Set r=a^2 mod m */
+/* SU= 88 */
+void FP_YYY_sqr(FP_YYY *r,FP_YYY *a)
+{
+    DBIG_XXX d;
+
+    /* same excess check as FP_YYY_mul, with a squared against itself */
+    if ((sign64)a->XES*a->XES>(sign64)FEXCESS_YYY)
+    {
+#ifdef DEBUG_REDUCE
+        printf("Product too large - reducing it\n");
+#endif
+        FP_YYY_reduce(a);
+    }
+
+    BIG_XXX_sqr(d,a->g);
+    FP_YYY_mod(r->g,d);
+    r->XES=2;
+}
+
+/* SU= 16 */
+/* Set r=a+b. Excess trackers add; a full reduction is triggered only on overflow risk */
+void FP_YYY_add(FP_YYY *r,FP_YYY *a,FP_YYY *b)
+{
+    BIG_XXX_add(r->g,a->g,b->g);
+    r->XES=a->XES+b->XES;
+    if (r->XES>FEXCESS_YYY)
+    {
+#ifdef DEBUG_REDUCE
+        printf("Sum too large - reducing it \n");
+#endif
+        FP_YYY_reduce(r);
+    }
+}
+
+/* Set r=a-b mod m, computed as a+(-b) */
+/* SU= 56 */
+void FP_YYY_sub(FP_YYY *r,FP_YYY *a,FP_YYY *b)
+{
+    FP_YYY negb;
+    FP_YYY_neg(&negb,b);
+    FP_YYY_add(r,a,&negb);
+}
+
+/* SU= 48 */
+/* Fully reduce a mod Modulus, resetting the excess tracker to 1 */
+void FP_YYY_reduce(FP_YYY *a)
+{
+    BIG_XXX m;
+    BIG_XXX_rcopy(m,Modulus_YYY);
+    BIG_XXX_mod(a->g,m);
+    a->XES=1;
+}
+
+/* Normalise the underlying BIG (propagate carries between chunks) */
+void FP_YYY_norm(FP_YYY *x)
+{
+    BIG_XXX_norm(x->g);
+}
+
+// https://graphics.stanford.edu/~seander/bithacks.html
+// constant time log to base 2 (or number of bits in)
+
+static int logb2(unsign32 v)
+{
+    int r;
+    /* smear the highest set bit down into all lower positions */
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+
+    /* SWAR population count of the smeared value = bit length of original v */
+    v = v - ((v >> 1) & 0x55555555);
+    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+    r = (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+    return r;
+}
+
+/* Set r=-a mod Modulus */
+/* SU= 64 */
+/* Subtracts a from a power-of-two multiple of the modulus that is
+   guaranteed (via the excess tracker) to exceed a, so no underflow */
+void FP_YYY_neg(FP_YYY *r,FP_YYY *a)
+{
+    int sb;
+    BIG_XXX m;
+
+    BIG_XXX_rcopy(m,Modulus_YYY);
+
+    sb=logb2(a->XES-1);   /* smallest shift with 2^sb >= a's excess */
+    BIG_XXX_fshl(m,sb);
+    BIG_XXX_sub(r->g,m,a->g);
+    r->XES=((sign32)1<<sb);
+
+    if (r->XES>FEXCESS_YYY)
+    {
+#ifdef DEBUG_REDUCE
+        printf("Negation too large -  reducing it \n");
+#endif
+        FP_YYY_reduce(r);
+    }
+
+}
+
+/* Set r=a/2. */
+/* SU= 56 */
+/* Even values shift right directly; odd values have the (odd) modulus
+   added first so the result of the shift stays exact mod p */
+void FP_YYY_div2(FP_YYY *r,FP_YYY *a)
+{
+    BIG_XXX m;
+    BIG_XXX_rcopy(m,Modulus_YYY);
+    FP_YYY_copy(r,a);
+    if (BIG_XXX_parity(a->g)==0)
+    {
+
+        BIG_XXX_fshr(r->g,1);
+    }
+    else
+    {
+        BIG_XXX_add(r->g,r->g,m);
+        BIG_XXX_norm(r->g);
+        BIG_XXX_fshr(r->g,1);
+    }
+}
+
+/* set w=1/x mod Modulus, by Fermat's little theorem: w = x^(p-2) mod p */
+void FP_YYY_inv(FP_YYY *w,FP_YYY *x)
+{
+    BIG_XXX e;
+    BIG_XXX_rcopy(e,Modulus_YYY);
+    BIG_XXX_dec(e,2);
+    BIG_XXX_norm(e);
+    FP_YYY_pow(w,x,e);
+}
+
+/* SU=8 */
+/* set n=1 (in residue form) */
+void FP_YYY_one(FP_YYY *n)
+{
+    BIG_XXX b;
+    BIG_XXX_one(b);
+    FP_YYY_nres(n,b);
+}
+
+/* Set r=a^b mod Modulus */
+/* SU= 136 */
+/* Fixed 4-bit window exponentiation with a 16-entry precomputed table.
+   NOTE(review): the table index depends on exponent bits, so lookups are
+   not constant-time - do not use with secret exponents without review */
+void FP_YYY_pow(FP_YYY *r,FP_YYY *a,BIG_XXX b)
+{
+    sign8 w[1+(NLEN_XXX*BASEBITS_XXX+3)/4];
+    FP_YYY tb[16];
+    BIG_XXX t;
+    int i,nb;
+
+    FP_YYY_norm(a);
+    BIG_XXX_norm(b);
+    BIG_XXX_copy(t,b);
+    nb=1+(BIG_XXX_nbits(t)+3)/4;
+    /* convert exponent to 4-bit window */
+    for (i=0; i<nb; i++)
+    {
+        w[i]=BIG_XXX_lastbits(t,4);
+        BIG_XXX_dec(t,w[i]);
+        BIG_XXX_norm(t);
+        BIG_XXX_fshr(t,4);
+    }
+
+    /* tb[i] = a^i for i = 0..15 */
+    FP_YYY_one(&tb[0]);
+    FP_YYY_copy(&tb[1],a);
+    for (i=2; i<16; i++)
+        FP_YYY_mul(&tb[i],&tb[i-1],a);
+
+    FP_YYY_copy(r,&tb[w[nb-1]]);
+    for (i=nb-2; i>=0; i--)
+    {
+        /* four squarings shift the accumulator by one window */
+        FP_YYY_sqr(r,r);
+        FP_YYY_sqr(r,r);
+        FP_YYY_sqr(r,r);
+        FP_YYY_sqr(r,r);
+        FP_YYY_mul(r,r,&tb[w[i]]);
+    }
+    FP_YYY_reduce(r);
+}
+
+/* Return 1 if r is a quadratic residue mod Modulus (Jacobi symbol == 1), else 0 */
+int FP_YYY_qr(FP_YYY *r)
+{
+    int jac;
+    BIG_XXX p;
+    BIG_XXX v;
+    BIG_XXX_rcopy(p,Modulus_YYY);
+    FP_YYY_redc(v,r);          /* work on the plain integer value */
+    jac=BIG_XXX_jacobi(v,p);
+    FP_YYY_nres(r,v);          /* restore residue form */
+    return (jac==1);
+}
+
+/* Set a=sqrt(b) mod Modulus */
+/* SU= 160 */
+/* Uses Atkin's method for p = 5 mod 8 and the (p+1)/4 exponent for p = 3 mod 4.
+   NOTE(review): if MOD8_YYY==1 neither branch runs and r is left unmodified -
+   confirm the configured moduli never hit this case */
+void FP_YYY_sqrt(FP_YYY *r,FP_YYY *a)
+{
+    FP_YYY v,i;
+    BIG_XXX b;
+    BIG_XXX m;
+    BIG_XXX_rcopy(m,Modulus_YYY);
+    BIG_XXX_mod(a->g,m);
+    BIG_XXX_copy(b,m);
+    if (MOD8_YYY==5)
+    {
+        BIG_XXX_dec(b,5);
+        BIG_XXX_norm(b);
+        BIG_XXX_fshr(b,3); /* (p-5)/8 */
+        FP_YYY_copy(&i,a);
+        BIG_XXX_fshl(i.g,1);
+        FP_YYY_pow(&v,&i,b);
+        FP_YYY_mul(&i,&i,&v);
+        FP_YYY_mul(&i,&i,&v);
+        BIG_XXX_dec(i.g,1);
+        FP_YYY_mul(r,a,&v);
+        FP_YYY_mul(r,r,&i);
+        FP_YYY_reduce(r);
+    }
+    if (MOD8_YYY==3 || MOD8_YYY==7)
+    {
+        BIG_XXX_inc(b,1);
+        BIG_XXX_norm(b);
+        BIG_XXX_fshr(b,2); /* (p+1)/4 */
+        FP_YYY_pow(r,a,b);
+    }
+}
+
+/*
+int main()
+{
+
+	BIG_XXX r;
+
+	FP_YYY_one(r);
+	FP_YYY_sqr(r,r);
+
+	BIG_XXX_output(r);
+
+	int i,carry;
+	DBIG_XXX c={0,0,0,0,0,0,0,0};
+	BIG_XXX a={1,2,3,4};
+	BIG_XXX b={3,4,5,6};
+	BIG_XXX r={11,12,13,14};
+	BIG_XXX s={23,24,25,15};
+	BIG_XXX w;
+
+//	printf("NEXCESS_XXX= %d\n",NEXCESS_XXX);
+//	printf("MConst_YYY= %d\n",MConst_YYY);
+
+	BIG_XXX_copy(b,Modulus_YYY);
+	BIG_XXX_dec(b,1);
+	BIG_XXX_norm(b);
+
+	BIG_XXX_randomnum(r); BIG_XXX_norm(r); BIG_XXX_mod(r,Modulus_YYY);
+//	BIG_XXX_randomnum(s); norm(s); BIG_XXX_mod(s,Modulus_YYY);
+
+//	BIG_XXX_output(r);
+//	BIG_XXX_output(s);
+
+	BIG_XXX_output(r);
+	FP_YYY_nres(r);
+	BIG_XXX_output(r);
+	BIG_XXX_copy(a,r);
+	FP_YYY_redc(r);
+	BIG_XXX_output(r);
+	BIG_XXX_dscopy(c,a);
+	FP_YYY_mod(r,c);
+	BIG_XXX_output(r);
+
+
+//	exit(0);
+
+//	copy(r,a);
+	printf("r=   "); BIG_XXX_output(r);
+	BIG_XXX_modsqr(r,r,Modulus_YYY);
+	printf("r^2= "); BIG_XXX_output(r);
+
+	FP_YYY_nres(r);
+	FP_YYY_sqrt(r,r);
+	FP_YYY_redc(r);
+	printf("r=   "); BIG_XXX_output(r);
+	BIG_XXX_modsqr(r,r,Modulus_YYY);
+	printf("r^2= "); BIG_XXX_output(r);
+
+
+//	for (i=0;i<100000;i++) FP_YYY_sqr(r,r);
+//	for (i=0;i<100000;i++)
+		FP_YYY_sqrt(r,r);
+
+	BIG_XXX_output(r);
+}
+*/

http://git-wip-us.apache.org/repos/asf/incubator-milagro-crypto-c/blob/8d28d2c3/src/fp12.c.in
----------------------------------------------------------------------
diff --git a/src/fp12.c.in b/src/fp12.c.in
new file mode 100644
index 0000000..67c6242
--- /dev/null
+++ b/src/fp12.c.in
@@ -0,0 +1,870 @@
+/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+*/
+
+/* AMCL Fp^12 functions */
+/* SU=m, m is Stack Usage (no lazy )*/
+/* FP12 elements are of the form a+i.b+i^2.c */
+
+#include "fp12_YYY.h"
+
+/* return 1 if b==c, no branching */
+/* NOTE(review): relies on arithmetic right shift of a negative sign32,
+   which is implementation-defined in C - verify on target compilers */
+static int teq(sign32 b,sign32 c)
+{
+    sign32 x=b^c;
+    x-=1;  // if x=0, x now -1
+    return (int)((x>>31)&1);
+}
+
+
+/* Constant time select from pre-computed table */
+/* b is an odd signed index; |b| maps to table slot (|b|-1)/2 and a
+   negative b selects the conjugate (the inverse of a unitary element) */
+static void FP12_YYY_select(FP12_YYY *f,FP12_YYY g[],sign32 b)
+{
+    FP12_YYY invf;
+    sign32 m=b>>31;          /* all-ones when b<0 */
+    sign32 babs=(b^m)-m;     /* branchless |b| */
+
+    babs=(babs-1)/2;
+
+    FP12_YYY_cmove(f,&g[0],teq(babs,0));  // conditional move
+    FP12_YYY_cmove(f,&g[1],teq(babs,1));
+    FP12_YYY_cmove(f,&g[2],teq(babs,2));
+    FP12_YYY_cmove(f,&g[3],teq(babs,3));
+    FP12_YYY_cmove(f,&g[4],teq(babs,4));
+    FP12_YYY_cmove(f,&g[5],teq(babs,5));
+    FP12_YYY_cmove(f,&g[6],teq(babs,6));
+    FP12_YYY_cmove(f,&g[7],teq(babs,7));
+
+    FP12_YYY_copy(&invf,f);
+    FP12_YYY_conj(&invf,&invf);  // 1/f
+    FP12_YYY_cmove(f,&invf,(int)(m&1));
+}
+
+
+
+/* test x==0 ? Return 1 when all three FP4 components are zero */
+/* SU= 8 */
+int FP12_YYY_iszilch(FP12_YYY *x)
+{
+    return (FP4_YYY_iszilch(&(x->a)) && FP4_YYY_iszilch(&(x->b)) && FP4_YYY_iszilch(&(x->c)));
+}
+
+/* test x==1 ? Return 1 when a==1 and b==c==0 */
+/* SU= 8 */
+int FP12_YYY_isunity(FP12_YYY *x)
+{
+    return (FP4_YYY_isunity(&(x->a)) && FP4_YYY_iszilch(&(x->b)) && FP4_YYY_iszilch(&(x->c)));
+}
+
+/* FP12 copy w=x (no work when source and destination alias) */
+/* SU= 16 */
+void FP12_YYY_copy(FP12_YYY *w,FP12_YYY *x)
+{
+    if (x!=w)
+    {
+        FP4_YYY_copy(&(w->a),&(x->a));
+        FP4_YYY_copy(&(w->b),&(x->b));
+        FP4_YYY_copy(&(w->c),&(x->c));
+    }
+}
+
+/* FP12 w=1 */
+/* SU= 8 */
+void FP12_YYY_one(FP12_YYY *w)
+{
+    FP4_YYY_one(&(w->a));
+    FP4_YYY_zero(&(w->b));
+    FP4_YYY_zero(&(w->c));
+}
+
+/* return 1 if x==y, else 0 */
+/* SU= 16 */
+int FP12_YYY_equals(FP12_YYY *x,FP12_YYY *y)
+{
+    /* Bug fix: the third conjunct previously re-tested the b components
+       (x->b vs y->b twice), so elements differing only in c compared equal */
+    if (FP4_YYY_equals(&(x->a),&(y->a)) && FP4_YYY_equals(&(x->b),&(y->b)) && FP4_YYY_equals(&(x->c),&(y->c)))
+        return 1;
+    return 0;
+}
+
+/* Set w=conj(x) */
+/* SU= 8 */
+/* Conjugation negates the middle coefficient (nconj) - for unitary
+   elements this is the inverse, exploited by FP12_YYY_select and pow */
+void FP12_YYY_conj(FP12_YYY *w,FP12_YYY *x)
+{
+    FP12_YYY_copy(w,x);
+    FP4_YYY_conj(&(w->a),&(w->a));
+    FP4_YYY_nconj(&(w->b),&(w->b));
+    FP4_YYY_conj(&(w->c),&(w->c));
+}
+
+/* Create FP12 from FP4 */
+/* SU= 8 */
+/* Embeds a into the constant term; b and c are zeroed */
+void FP12_YYY_from_FP4(FP12_YYY *w,FP4_YYY *a)
+{
+    FP4_YYY_copy(&(w->a),a);
+    FP4_YYY_zero(&(w->b));
+    FP4_YYY_zero(&(w->c));
+}
+
+/* Create FP12 from 3 FP4's */
+/* SU= 16 */
+void FP12_YYY_from_FP4s(FP12_YYY *w,FP4_YYY *a,FP4_YYY *b,FP4_YYY *c)
+{
+    FP4_YYY_copy(&(w->a),a);
+    FP4_YYY_copy(&(w->b),b);
+    FP4_YYY_copy(&(w->c),c);
+}
+
+/* Granger-Scott Unitary Squaring. This does not benefit from lazy reduction */
+/* SU= 600 */
+/* Valid ONLY for unitary elements (e.g. pairing outputs after the final
+   exponentiation), where conj(x) = 1/x */
+void FP12_YYY_usqr(FP12_YYY *w,FP12_YYY *x)
+{
+    FP4_YYY A,B,C,D;
+
+    FP4_YYY_copy(&A,&(x->a));
+
+    FP4_YYY_sqr(&(w->a),&(x->a));
+    FP4_YYY_add(&D,&(w->a),&(w->a));
+    FP4_YYY_add(&(w->a),&D,&(w->a));
+
+    FP4_YYY_norm(&(w->a));
+    FP4_YYY_nconj(&A,&A);
+
+    FP4_YYY_add(&A,&A,&A);
+    FP4_YYY_add(&(w->a),&(w->a),&A);
+    FP4_YYY_sqr(&B,&(x->c));
+    FP4_YYY_times_i(&B);
+
+    FP4_YYY_add(&D,&B,&B);
+    FP4_YYY_add(&B,&B,&D);
+    FP4_YYY_norm(&B);
+
+    FP4_YYY_sqr(&C,&(x->b));
+
+    FP4_YYY_add(&D,&C,&C);
+    FP4_YYY_add(&C,&C,&D);
+
+    FP4_YYY_norm(&C);
+    FP4_YYY_conj(&(w->b),&(x->b));
+    FP4_YYY_add(&(w->b),&(w->b),&(w->b));
+    FP4_YYY_nconj(&(w->c),&(x->c));
+
+    FP4_YYY_add(&(w->c),&(w->c),&(w->c));
+    FP4_YYY_add(&(w->b),&B,&(w->b));
+    FP4_YYY_add(&(w->c),&C,&(w->c));
+
+    FP12_YYY_reduce(w);	    /* reduce here as in pow function repeated squarings would trigger multiple reductions */
+}
+
+/* FP12 squaring w=x^2 */
+/* SU= 600 */
+/* General (non-unitary) squaring; for unitary elements prefer FP12_YYY_usqr */
+void FP12_YYY_sqr(FP12_YYY *w,FP12_YYY *x)
+{
+    /* Use Chung-Hasan SQR2 method from http://cacr.uwaterloo.ca/techreports/2006/cacr2006-24.pdf */
+
+    FP4_YYY A,B,C,D;
+
+    FP4_YYY_sqr(&A,&(x->a));
+    FP4_YYY_mul(&B,&(x->b),&(x->c));
+    FP4_YYY_add(&B,&B,&B);
+    FP4_YYY_norm(&B);
+    FP4_YYY_sqr(&C,&(x->c));
+
+    FP4_YYY_mul(&D,&(x->a),&(x->b));
+    FP4_YYY_add(&D,&D,&D);
+    FP4_YYY_add(&(w->c),&(x->a),&(x->c));
+    FP4_YYY_add(&(w->c),&(x->b),&(w->c));
+    FP4_YYY_norm(&(w->c));
+
+    FP4_YYY_sqr(&(w->c),&(w->c));
+
+    FP4_YYY_copy(&(w->a),&A);
+    FP4_YYY_add(&A,&A,&B);
+
+    FP4_YYY_norm(&A);
+
+    FP4_YYY_add(&A,&A,&C);
+    FP4_YYY_add(&A,&A,&D);
+
+    FP4_YYY_norm(&A);
+    FP4_YYY_neg(&A,&A);
+    FP4_YYY_times_i(&B);
+    FP4_YYY_times_i(&C);
+
+    FP4_YYY_add(&(w->a),&(w->a),&B);
+    FP4_YYY_add(&(w->b),&C,&D);
+    FP4_YYY_add(&(w->c),&(w->c),&A);
+
+    FP12_YYY_norm(w);
+}
+
+/* FP12 full multiplication w=w*y */
+/* SU= 896 */
+/* Karatsuba-style interpolation over the three FP4 coefficients;
+   subtractions are performed as additions of negations */
+void FP12_YYY_mul(FP12_YYY *w,FP12_YYY *y)
+{
+    FP4_YYY z0,z1,z2,z3,t0,t1;
+
+    FP4_YYY_mul(&z0,&(w->a),&(y->a));
+    FP4_YYY_mul(&z2,&(w->b),&(y->b));
+
+    FP4_YYY_add(&t0,&(w->a),&(w->b));
+    FP4_YYY_add(&t1,&(y->a),&(y->b));
+
+    FP4_YYY_norm(&t0);
+    FP4_YYY_norm(&t1);
+
+    FP4_YYY_mul(&z1,&t0,&t1);
+    FP4_YYY_add(&t0,&(w->b),&(w->c));
+    FP4_YYY_add(&t1,&(y->b),&(y->c));
+
+    FP4_YYY_norm(&t0);
+    FP4_YYY_norm(&t1);
+
+    FP4_YYY_mul(&z3,&t0,&t1);
+
+    FP4_YYY_neg(&t0,&z0);
+    FP4_YYY_neg(&t1,&z2);
+
+    FP4_YYY_add(&z1,&z1,&t0);   // z1=z1-z0
+    FP4_YYY_add(&(w->b),&z1,&t1);    // z1=z1-z2
+    FP4_YYY_add(&z3,&z3,&t1);        // z3=z3-z2
+    FP4_YYY_add(&z2,&z2,&t0);        // z2=z2-z0
+
+    FP4_YYY_add(&t0,&(w->a),&(w->c));
+    FP4_YYY_add(&t1,&(y->a),&(y->c));
+
+    FP4_YYY_norm(&t0);
+    FP4_YYY_norm(&t1);
+
+    FP4_YYY_mul(&t0,&t1,&t0);
+    FP4_YYY_add(&z2,&z2,&t0);
+
+    FP4_YYY_mul(&t0,&(w->c),&(y->c));
+    FP4_YYY_neg(&t1,&t0);
+
+    FP4_YYY_add(&(w->c),&z2,&t1);
+    FP4_YYY_add(&z3,&z3,&t1);
+    FP4_YYY_times_i(&t0);
+    FP4_YYY_add(&(w->b),&(w->b),&t0);
+    FP4_YYY_norm(&z3);
+    FP4_YYY_times_i(&z3);
+    FP4_YYY_add(&(w->a),&z0,&z3);
+
+    FP12_YYY_norm(w);
+}
+
+/* FP12 multiplication w=w*y */
+/* SU= 744 */
+/* catering for special case that arises from special form of ATE pairing line function */
+/* D_TYPE assumes y->c==0 and M_TYPE assumes y->b==0, saving FP4 multiplications */
+void FP12_YYY_smul(FP12_YYY *w,FP12_YYY *y,int type)
+{
+    FP4_YYY z0,z1,z2,z3,t0,t1;
+
+    if (type==D_TYPE)
+    {
+        // y->c is 0
+
+        FP4_YYY_copy(&z3,&(w->b));
+        FP4_YYY_mul(&z0,&(w->a),&(y->a));
+
+        FP4_YYY_pmul(&z2,&(w->b),&(y->b).a);
+        FP4_YYY_add(&(w->b),&(w->a),&(w->b));
+        FP4_YYY_copy(&t1,&(y->a));
+        FP2_YYY_add(&t1.a,&t1.a,&(y->b).a);
+
+        FP4_YYY_norm(&t1);
+        FP4_YYY_norm(&(w->b));
+
+        FP4_YYY_mul(&(w->b),&(w->b),&t1);
+        FP4_YYY_add(&z3,&z3,&(w->c));
+        FP4_YYY_norm(&z3);
+        FP4_YYY_pmul(&z3,&z3,&(y->b).a);
+        FP4_YYY_neg(&t0,&z0);
+        FP4_YYY_neg(&t1,&z2);
+
+        FP4_YYY_add(&(w->b),&(w->b),&t0);   // z1=z1-z0
+        FP4_YYY_add(&(w->b),&(w->b),&t1);   // z1=z1-z2
+
+        FP4_YYY_add(&z3,&z3,&t1);        // z3=z3-z2
+        FP4_YYY_add(&z2,&z2,&t0);        // z2=z2-z0
+
+        FP4_YYY_add(&t0,&(w->a),&(w->c));
+
+        FP4_YYY_norm(&t0);
+        FP4_YYY_norm(&z3);
+
+        FP4_YYY_mul(&t0,&(y->a),&t0);
+        FP4_YYY_add(&(w->c),&z2,&t0);
+
+        FP4_YYY_times_i(&z3);
+        FP4_YYY_add(&(w->a),&z0,&z3);
+    }
+
+    if (type==M_TYPE)
+    {
+        // y->b is zero
+        FP4_YYY_mul(&z0,&(w->a),&(y->a));
+        FP4_YYY_add(&t0,&(w->a),&(w->b));
+        FP4_YYY_norm(&t0);
+
+        FP4_YYY_mul(&z1,&t0,&(y->a));
+        FP4_YYY_add(&t0,&(w->b),&(w->c));
+        FP4_YYY_norm(&t0);
+
+        FP4_YYY_pmul(&z3,&t0,&(y->c).b);
+        FP4_YYY_times_i(&z3);
+
+        FP4_YYY_neg(&t0,&z0);
+        FP4_YYY_add(&z1,&z1,&t0);   // z1=z1-z0
+
+        FP4_YYY_copy(&(w->b),&z1);
+
+        FP4_YYY_copy(&z2,&t0);
+
+        FP4_YYY_add(&t0,&(w->a),&(w->c));
+        FP4_YYY_add(&t1,&(y->a),&(y->c));
+
+        FP4_YYY_norm(&t0);
+        FP4_YYY_norm(&t1);
+
+        FP4_YYY_mul(&t0,&t1,&t0);
+        FP4_YYY_add(&z2,&z2,&t0);
+
+        FP4_YYY_pmul(&t0,&(w->c),&(y->c).b);
+        FP4_YYY_times_i(&t0);
+        FP4_YYY_neg(&t1,&t0);
+        FP4_YYY_times_i(&t0);
+
+        FP4_YYY_add(&(w->c),&z2,&t1);
+        FP4_YYY_add(&z3,&z3,&t1);
+
+        FP4_YYY_add(&(w->b),&(w->b),&t0);
+        FP4_YYY_norm(&z3);
+        FP4_YYY_times_i(&z3);
+        FP4_YYY_add(&(w->a),&z0,&z3);
+    }
+    FP12_YYY_norm(w);
+}
+
+/* Set w=1/x */
+/* SU= 600 */
+/* Computes the adjugate coefficients f0,f1,f2 and divides by the norm
+   f3 = a*f0 + i*(b*f2 + c*f1), reducing the FP12 inverse to one FP4 inverse */
+void FP12_YYY_inv(FP12_YYY *w,FP12_YYY *x)
+{
+    FP4_YYY f0,f1,f2,f3;
+
+    FP4_YYY_sqr(&f0,&(x->a));
+    FP4_YYY_mul(&f1,&(x->b),&(x->c));
+    FP4_YYY_times_i(&f1);
+    FP4_YYY_sub(&f0,&f0,&f1);  /* y.a */
+    FP4_YYY_norm(&f0);
+
+    FP4_YYY_sqr(&f1,&(x->c));
+    FP4_YYY_times_i(&f1);
+    FP4_YYY_mul(&f2,&(x->a),&(x->b));
+    FP4_YYY_sub(&f1,&f1,&f2);  /* y.b */
+    FP4_YYY_norm(&f1);
+
+    FP4_YYY_sqr(&f2,&(x->b));
+    FP4_YYY_mul(&f3,&(x->a),&(x->c));
+    FP4_YYY_sub(&f2,&f2,&f3);  /* y.c */
+    FP4_YYY_norm(&f2);
+
+    FP4_YYY_mul(&f3,&(x->b),&f2);
+    FP4_YYY_times_i(&f3);
+    FP4_YYY_mul(&(w->a),&f0,&(x->a));
+    FP4_YYY_add(&f3,&(w->a),&f3);
+    FP4_YYY_mul(&(w->c),&f1,&(x->c));
+    FP4_YYY_times_i(&(w->c));
+
+    FP4_YYY_add(&f3,&(w->c),&f3);
+    FP4_YYY_norm(&f3);
+
+    FP4_YYY_inv(&f3,&f3);
+
+    FP4_YYY_mul(&(w->a),&f0,&f3);
+    FP4_YYY_mul(&(w->b),&f1,&f3);
+    FP4_YYY_mul(&(w->c),&f2,&f3);
+
+}
+
+/* constant time powering by small integer of max length bts */
+/* Montgomery-ladder style: both table slots are touched every iteration,
+   so the memory access pattern is independent of the exponent bits */
+void FP12_YYY_pinpow(FP12_YYY *r,int e,int bts)
+{
+    int i,b;
+    FP12_YYY R[2];
+
+    FP12_YYY_one(&R[0]);
+    FP12_YYY_copy(&R[1],r);
+
+    for (i=bts-1; i>=0; i--)
+    {
+        b=(e>>i)&1;
+        FP12_YYY_mul(&R[1-b],&R[b]);
+        FP12_YYY_usqr(&R[b],&R[b]);
+    }
+    FP12_YYY_copy(r,&R[0]);
+}
+
+/* Compressed powering of unitary elements y=x^(e mod r) */
+/* Works on XTR-style traces (FP4) rather than full FP12 values; the exponent
+   is split as e = b*m + a with m = q mod r, then combined via xtr_pow2 */
+void FP12_YYY_compow(FP4_YYY *c,FP12_YYY *x,BIG_XXX e,BIG_XXX r)
+{
+    FP12_YYY g1,g2;
+    FP4_YYY cp,cpm1,cpm2;
+    FP2_YYY f;
+    BIG_XXX q,a,b,m;
+
+    /* f is the Frobenius constant built from ROM values Fra, Frb */
+    BIG_XXX_rcopy(a,Fra_YYY);
+    BIG_XXX_rcopy(b,Frb_YYY);
+    FP2_YYY_from_BIGs(&f,a,b);
+
+    BIG_XXX_rcopy(q,Modulus_YYY);
+
+    FP12_YYY_copy(&g1,x);
+    FP12_YYY_copy(&g2,x);
+
+    BIG_XXX_copy(m,q);
+    BIG_XXX_mod(m,r);
+
+    BIG_XXX_copy(a,e);
+    BIG_XXX_mod(a,m);
+
+    BIG_XXX_copy(b,e);
+    BIG_XXX_sdiv(b,m);
+
+    FP12_YYY_trace(c,&g1);
+
+    /* small exponent: plain XTR powering suffices */
+    if (BIG_XXX_iszilch(b))
+    {
+        FP4_YYY_xtr_pow(c,c,e);
+        return;
+    }
+
+
+    FP12_YYY_frob(&g2,&f);
+    FP12_YYY_trace(&cp,&g2);
+
+    FP12_YYY_conj(&g1,&g1);
+    FP12_YYY_mul(&g2,&g1);
+    FP12_YYY_trace(&cpm1,&g2);
+    FP12_YYY_mul(&g2,&g1);
+    FP12_YYY_trace(&cpm2,&g2);
+
+    FP4_YYY_xtr_pow2(c,&cp,c,&cpm1,&cpm2,a,b);
+
+}
+
+
+/* SU= 528 */
+/* set r=a^b */
+/* Note this is simple square and multiply, so not side-channel safe */
+/* NAF-style recoding using b and 3b; a is temporarily conjugated for the
+   -1 digits but restored, so a is unchanged on return */
+void FP12_YYY_pow(FP12_YYY *r,FP12_YYY *a,BIG_XXX b)
+{
+    FP12_YYY w;
+    BIG_XXX b3;
+    int i,nb,bt;
+    BIG_XXX_norm(b);
+    BIG_XXX_pmul(b3,b,3);
+    BIG_XXX_norm(b3);
+
+    FP12_YYY_copy(&w,a);
+
+    nb=BIG_XXX_nbits(b3);
+    for (i=nb-2; i>=1; i--)
+    {
+        FP12_YYY_usqr(&w,&w);
+        bt=BIG_XXX_bit(b3,i)-BIG_XXX_bit(b,i);
+        if (bt==1)
+            FP12_YYY_mul(&w,a);
+        if (bt==-1)
+        {
+            FP12_YYY_conj(a,a);
+            FP12_YYY_mul(&w,a);
+            FP12_YYY_conj(a,a);
+        }
+    }
+
+    FP12_YYY_copy(r,&w);
+    FP12_YYY_reduce(r);
+}
+
+/* p=q0^u0.q1^u1.q2^u2.q3^u3 */
+/* Side channel attack secure */
+// Bos & Costello https://eprint.iacr.org/2013/458.pdf
+// Faz-Hernandez & Longa & Sanchez  https://eprint.iacr.org/2013/158.pdf
+
+void FP12_YYY_pow4(FP12_YYY *p,FP12_YYY *q,BIG_XXX u[4])
+{
+    int i,j,k,nb,pb,bt;
+    FP12_YYY g[8],r;
+    BIG_XXX t[4],mt;
+    sign8 w[NLEN_XXX*BASEBITS_XXX+1];
+    sign8 s[NLEN_XXX*BASEBITS_XXX+1];
+
+    for (i=0; i<4; i++)
+        BIG_XXX_copy(t[i],u[i]);
+
+
+    // Precomputed table
+    FP12_YYY_copy(&g[0],&q[0]); // q[0]
+    FP12_YYY_copy(&g[1],&g[0]);
+    FP12_YYY_mul(&g[1],&q[1]);	// q[0].q[1]
+    FP12_YYY_copy(&g[2],&g[0]);
+    FP12_YYY_mul(&g[2],&q[2]);	// q[0].q[2]
+    FP12_YYY_copy(&g[3],&g[1]);
+    FP12_YYY_mul(&g[3],&q[2]);	// q[0].q[1].q[2]
+    FP12_YYY_copy(&g[4],&g[0]);
+    FP12_YYY_mul(&g[4],&q[3]);  // q[0].q[3]
+    FP12_YYY_copy(&g[5],&g[1]);
+    FP12_YYY_mul(&g[5],&q[3]);	// q[0].q[1].q[3]
+    FP12_YYY_copy(&g[6],&g[2]);
+    FP12_YYY_mul(&g[6],&q[3]);	// q[0].q[2].q[3]
+    FP12_YYY_copy(&g[7],&g[3]);
+    FP12_YYY_mul(&g[7],&q[3]);	// q[0].q[1].q[2].q[3]
+
+    // Make it odd
+    /* the recoding below requires an odd t[0]; pb records the adjustment */
+    pb=1-BIG_XXX_parity(t[0]);
+    BIG_XXX_inc(t[0],pb);
+    BIG_XXX_norm(t[0]);
+
+    // Number of bits
+    BIG_XXX_zero(mt);
+    for (i=0; i<4; i++)
+    {
+        BIG_XXX_or(mt,mt,t[i]);
+    }
+    nb=1+BIG_XXX_nbits(mt);
+
+    // Sign pivot
+    s[nb-1]=1;
+    for (i=0; i<nb-1; i++)
+    {
+        BIG_XXX_fshr(t[0],1);
+        s[i]=2*BIG_XXX_parity(t[0])-1;
+    }
+
+    // Recoded exponent
+    for (i=0; i<nb; i++)
+    {
+        w[i]=0;
+        k=1;
+        for (j=1; j<4; j++)
+        {
+            bt=s[i]*BIG_XXX_parity(t[j]);
+            BIG_XXX_fshr(t[j],1);
+
+            BIG_XXX_dec(t[j],(bt>>1));
+            BIG_XXX_norm(t[j]);
+            w[i]+=bt*k;
+            k*=2;
+        }
+    }
+
+    // Main loop
+    FP12_YYY_select(p,g,2*w[nb-1]+1);
+    for (i=nb-2; i>=0; i--)
+    {
+        FP12_YYY_select(&r,g,2*w[i]+s[i]);
+        FP12_YYY_usqr(p,p);
+        FP12_YYY_mul(p,&r);
+    }
+    // apply correction
+    /* undo the "make it odd" adjustment: multiply by q[0]^-1 when pb==1 */
+    FP12_YYY_conj(&r,&q[0]);
+    FP12_YYY_mul(&r,p);
+    FP12_YYY_cmove(p,&r,pb);
+
+    FP12_YYY_reduce(p);
+}
+
+/* Set w=w^p using Frobenius */
+/* SU= 160 */
+/* f is the Frobenius constant; coefficients are twisted by f and f^2 */
+void FP12_YYY_frob(FP12_YYY *w,FP2_YYY *f)
+{
+    FP2_YYY f2,f3;
+    FP2_YYY_sqr(&f2,f);     /* f2=f^2 */
+    FP2_YYY_mul(&f3,&f2,f); /* f3=f^3 */
+
+    FP4_YYY_frob(&(w->a),&f3);
+    FP4_YYY_frob(&(w->b),&f3);
+    FP4_YYY_frob(&(w->c),&f3);
+
+    FP4_YYY_pmul(&(w->b),&(w->b),f);
+    FP4_YYY_pmul(&(w->c),&(w->c),&f2);
+}
+
+/* SU= 8 */
+/* normalise all components of w */
+void FP12_YYY_norm(FP12_YYY *w)
+{
+    FP4_YYY_norm(&(w->a));
+    FP4_YYY_norm(&(w->b));
+    FP4_YYY_norm(&(w->c));
+}
+
+/* SU= 8 */
+/* reduce all components of w */
+void FP12_YYY_reduce(FP12_YYY *w)
+{
+    FP4_YYY_reduce(&(w->a));
+    FP4_YYY_reduce(&(w->b));
+    FP4_YYY_reduce(&(w->c));
+}
+
+/* trace function w=trace(x) */
+/* SU= 8 */
+/* Computed as 3 times the a coefficient (the Frobenius-conjugate sum) */
+void FP12_YYY_trace(FP4_YYY *w,FP12_YYY *x)
+{
+    FP4_YYY_imul(w,&(x->a),3);
+    FP4_YYY_reduce(w);
+}
+
+/* SU= 8 */
+/* Output w in hex as [a,b,c] */
+void FP12_YYY_output(FP12_YYY *w)
+{
+    printf("[");
+    FP4_YYY_output(&(w->a));
+    printf(",");
+    FP4_YYY_output(&(w->b));
+    printf(",");
+    FP4_YYY_output(&(w->c));
+    printf("]");
+}
+
+/* SU= 64 */
+/* Convert g to octet string w */
+/* Serialises the 12 FP coefficients in fixed order (a.a.a first), each as
+   MODBYTES bytes in regular (non-residue) form; total length 12*MODBYTES */
+void FP12_YYY_toOctet(octet *W,FP12_YYY *g)
+{
+    BIG_XXX a;
+    W->len=12*MODBYTES_XXX;
+
+    FP_YYY_redc(a,&(g->a.a.a));
+    BIG_XXX_toBytes(&(W->val[0]),a);
+    FP_YYY_redc(a,&(g->a.a.b));
+    BIG_XXX_toBytes(&(W->val[MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->a.b.a));
+    BIG_XXX_toBytes(&(W->val[2*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->a.b.b));
+    BIG_XXX_toBytes(&(W->val[3*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->b.a.a));
+    BIG_XXX_toBytes(&(W->val[4*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->b.a.b));
+    BIG_XXX_toBytes(&(W->val[5*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->b.b.a));
+    BIG_XXX_toBytes(&(W->val[6*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->b.b.b));
+    BIG_XXX_toBytes(&(W->val[7*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->c.a.a));
+    BIG_XXX_toBytes(&(W->val[8*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->c.a.b));
+    BIG_XXX_toBytes(&(W->val[9*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->c.b.a));
+    BIG_XXX_toBytes(&(W->val[10*MODBYTES_XXX]),a);
+    FP_YYY_redc(a,&(g->c.b.b));
+    BIG_XXX_toBytes(&(W->val[11*MODBYTES_XXX]),a);
+}
+
+/* SU= 24 */
+/* Restore g from octet string w */
+/* Inverse of FP12_YYY_toOctet: reads 12 coefficients in the same fixed order */
+void FP12_YYY_fromOctet(FP12_YYY *g,octet *W)
+{
+    BIG_XXX b;
+    BIG_XXX_fromBytes(b,&W->val[0]);
+    FP_YYY_nres(&(g->a.a.a),b);
+    BIG_XXX_fromBytes(b,&W->val[MODBYTES_XXX]);
+    FP_YYY_nres(&(g->a.a.b),b);
+    BIG_XXX_fromBytes(b,&W->val[2*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->a.b.a),b);
+    BIG_XXX_fromBytes(b,&W->val[3*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->a.b.b),b);
+    BIG_XXX_fromBytes(b,&W->val[4*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->b.a.a),b);
+    BIG_XXX_fromBytes(b,&W->val[5*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->b.a.b),b);
+    BIG_XXX_fromBytes(b,&W->val[6*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->b.b.a),b);
+    BIG_XXX_fromBytes(b,&W->val[7*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->b.b.b),b);
+    BIG_XXX_fromBytes(b,&W->val[8*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->c.a.a),b);
+    BIG_XXX_fromBytes(b,&W->val[9*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->c.a.b),b);
+    BIG_XXX_fromBytes(b,&W->val[10*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->c.b.a),b);
+    BIG_XXX_fromBytes(b,&W->val[11*MODBYTES_XXX]);
+    FP_YYY_nres(&(g->c.b.b),b);
+}
+
+/* Move b to a if d=1 */
+/* Constant-time: delegates to the branch-free FP4 conditional moves */
+void FP12_YYY_cmove(FP12_YYY *f,FP12_YYY *g,int d)
+{
+    FP4_YYY_cmove(&(f->a),&(g->a),d);
+    FP4_YYY_cmove(&(f->b),&(g->b),d);
+    FP4_YYY_cmove(&(f->c),&(g->c),d);
+}
+
+/*
+int main(){
+		FP2_YYY f,w0,w1;
+		FP4_YYY t0,t1,t2;
+		FP12_YYY w,t,lv;
+		BIG_XXX a,b;
+		BIG_XXX p;
+
+		//Test w^(P^4) = w mod p^2
+//		BIG_XXX_randomnum(a);
+//		BIG_XXX_randomnum(b);
+//		BIG_XXX_mod(a,Modulus); BIG_XXX_mod(b,Modulus);
+	BIG_XXX_zero(a); BIG_XXX_zero(b); BIG_XXX_inc(a,1); BIG_XXX_inc(b,2); FP_YYY_nres(a); FP_YYY_nres(b);
+		FP2_YYY_from_zps(&w0,a,b);
+
+//		BIG_XXX_randomnum(a); BIG_XXX_randomnum(b);
+//		BIG_XXX_mod(a,Modulus); BIG_XXX_mod(b,Modulus);
+	BIG_XXX_zero(a); BIG_XXX_zero(b); BIG_XXX_inc(a,3); BIG_XXX_inc(b,4); FP_YYY_nres(a); FP_YYY_nres(b);
+		FP2_YYY_from_zps(&w1,a,b);
+
+		FP4_YYY_from_FP2s(&t0,&w0,&w1);
+		FP4_YYY_reduce(&t0);
+
+//		BIG_XXX_randomnum(a);
+//		BIG_XXX_randomnum(b);
+//		BIG_XXX_mod(a,Modulus); BIG_XXX_mod(b,Modulus);
+		BIG_XXX_zero(a); BIG_XXX_zero(b); BIG_XXX_inc(a,5); BIG_XXX_inc(b,6); FP_YYY_nres(a); FP_YYY_nres(b);
+		FP2_YYY_from_zps(&w0,a,b);
+
+//		BIG_XXX_randomnum(a); BIG_XXX_randomnum(b);
+//		BIG_XXX_mod(a,Modulus); BIG_XXX_mod(b,Modulus);
+
+		BIG_XXX_zero(a); BIG_XXX_zero(b); BIG_XXX_inc(a,7); BIG_XXX_inc(b,8); FP_YYY_nres(a); FP_YYY_nres(b);
+		FP2_YYY_from_zps(&w1,a,b);
+
+		FP4_YYY_from_FP2s(&t1,&w0,&w1);
+		FP4_YYY_reduce(&t1);
+
+//		BIG_XXX_randomnum(a);
+//		BIG_XXX_randomnum(b);
+//		BIG_XXX_mod(a,Modulus); BIG_XXX_mod(b,Modulus);
+		BIG_XXX_zero(a); BIG_XXX_zero(b); BIG_XXX_inc(a,9); BIG_XXX_inc(b,10); FP_YYY_nres(a); FP_YYY_nres(b);
+		FP2_YYY_from_zps(&w0,a,b);
+
+//		BIG_XXX_randomnum(a); BIG_XXX_randomnum(b);
+//		BIG_XXX_mod(a,Modulus); BIG_XXX_mod(b,Modulus);
+		BIG_XXX_zero(a); BIG_XXX_zero(b); BIG_XXX_inc(a,11); BIG_XXX_inc(b,12); FP_YYY_nres(a); FP_YYY_nres(b);
+		FP2_YYY_from_zps(&w1,a,b);
+
+		FP4_YYY_from_FP2s(&t2,&w0,&w1);
+		FP4_YYY_reduce(&t2);
+
+		FP12_YYY_from_FP4s(&w,&t0,&t1,&t2);
+
+		FP12_YYY_copy(&t,&w);
+
+		printf("w= ");
+		FP12_YYY_output(&w);
+		printf("\n");
+
+		BIG_XXX_rcopy(p,Modulus);
+		//BIG_XXX_zero(p); BIG_XXX_inc(p,7);
+
+		FP12_YYY_pow(&w,&w,p);
+
+		printf("w^p= ");
+		FP12_YYY_output(&w);
+		printf("\n");
+
+		FP2_YYY_gfc(&f,12);
+		FP12_YYY_frob(&t,&f);
+		printf("w^p= ");
+		FP12_YYY_output(&t);
+		printf("\n");
+
+//exit(0);
+
+		FP12_YYY_pow(&w,&w,p);
+		//printf("w^p^2= ");
+		//FP12_YYY_output(&w);
+		//printf("\n");
+		FP12_YYY_pow(&w,&w,p);
+		//printf("w^p^3= ");
+		//FP12_YYY_output(&w);
+		//printf("\n");
+		FP12_YYY_pow(&w,&w,p);
+		FP12_YYY_pow(&w,&w,p);
+		FP12_YYY_pow(&w,&w,p);
+		printf("w^p^6= ");
+		FP12_YYY_output(&w);
+		printf("\n");
+		FP12_YYY_pow(&w,&w,p);
+		FP12_YYY_pow(&w,&w,p);
+		printf("w^p^8= ");
+		FP12_YYY_output(&w);
+		printf("\n");
+		FP12_YYY_pow(&w,&w,p);
+		FP12_YYY_pow(&w,&w,p);
+		FP12_YYY_pow(&w,&w,p);
+		printf("w^p^11= ");
+		FP12_YYY_output(&w);
+		printf("\n");
+
+	//	BIG_XXX_zero(p); BIG_XXX_inc(p,7); BIG_XXX_norm(p);
+		FP12_YYY_pow(&w,&w,p);
+
+		printf("w^p12= ");
+		FP12_YYY_output(&w);
+		printf("\n");
+//exit(0);
+
+		FP12_YYY_inv(&t,&w);
+		printf("1/w mod p^4 = ");
+		FP12_YYY_output(&t);
+		printf("\n");
+
+		FP12_YYY_inv(&w,&t);
+		printf("1/(1/w) mod p^4 = ");
+		FP12_YYY_output(&w);
+		printf("\n");
+
+
+
+	FP12_YYY_inv(&lv,&w);
+//printf("w= "); FP12_YYY_output(&w); printf("\n");
+	FP12_YYY_conj(&w,&w);
+//printf("w= "); FP12_YYY_output(&w); printf("\n");
+//exit(0);
+	FP12_YYY_mul(&w,&w,&lv);
+//printf("w= "); FP12_YYY_output(&w); printf("\n");
+	FP12_YYY_copy(&lv,&w);
+	FP12_YYY_frob(&w,&f);
+	FP12_YYY_frob(&w,&f);
+	FP12_YYY_mul(&w,&w,&lv);
+
+//printf("w= "); FP12_YYY_output(&w); printf("\n");
+//exit(0);
+
+w.unitary=0;
+FP12_YYY_conj(&lv,&w);
+	printf("rx= "); FP12_YYY_output(&lv); printf("\n");
+FP12_YYY_inv(&lv,&w);
+	printf("ry= "); FP12_YYY_output(&lv); printf("\n");
+
+
+		return 0;
+}
+
+*/

http://git-wip-us.apache.org/repos/asf/incubator-milagro-crypto-c/blob/8d28d2c3/src/fp16.c.in
----------------------------------------------------------------------
diff --git a/src/fp16.c.in b/src/fp16.c.in
new file mode 100644
index 0000000..a548526
--- /dev/null
+++ b/src/fp16.c.in
@@ -0,0 +1,693 @@
+/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+*/
+
+/* AMCL Fp^16 functions */
+
+/* FP16 elements are of the form a+ib, where i is sqrt(sqrt(-1+sqrt(-1))) */
+
+#include "fp16_YYY.h"
+
+
+/* test x==0 ? */
+/* returns 1 when both FP8 components are zero, else 0 */
+int FP16_YYY_iszilch(FP16_YYY *x)
+{
+    if (!FP8_YYY_iszilch(&(x->a))) return 0;
+    if (!FP8_YYY_iszilch(&(x->b))) return 0;
+    return 1;
+}
+
+/* test x==1 ? */
+/* 1 iff the real part is one and the imaginary part is zero */
+int FP16_YYY_isunity(FP16_YYY *x)
+{
+    if (!FP8_YYY_isunity(&(x->a))) return 0;
+    return FP8_YYY_iszilch(&(x->b)) ? 1 : 0;
+}
+
+/* test is w real? That is in a+ib test b is zero */
+int FP16_YYY_isreal(FP16_YYY *w)
+{
+    /* w is real exactly when the imaginary FP8 component vanishes */
+    int real=FP8_YYY_iszilch(&(w->b));
+    return real;
+}
+
+/* return 1 if x==y, else 0 */
+/* component-wise comparison; short-circuits on the real part */
+int FP16_YYY_equals(FP16_YYY *x,FP16_YYY *y)
+{
+    if (!FP8_YYY_equals(&(x->a),&(y->a))) return 0;
+    return FP8_YYY_equals(&(x->b),&(y->b)) ? 1 : 0;
+}
+
+/* set FP16 from two FP8s */
+/* w = x + i.y: x becomes the real part, y the imaginary part */
+void FP16_YYY_from_FP8s(FP16_YYY *w,FP8_YYY * x,FP8_YYY* y)
+{
+    FP8_YYY_copy(&(w->a), x);
+    FP8_YYY_copy(&(w->b), y);
+}
+
+/* set FP16 from FP8 */
+/* w = x: embed x in the real part; imaginary part cleared to zero */
+void FP16_YYY_from_FP8(FP16_YYY *w,FP8_YYY *x)
+{
+    FP8_YYY_copy(&(w->a), x);
+    FP8_YYY_zero(&(w->b));
+}
+
+/* set high part of FP16 from FP8 */
+/* w = i.x: embed x in the imaginary part; real part cleared to zero */
+void FP16_YYY_from_FP8H(FP16_YYY *w,FP8_YYY *x)
+{
+    FP8_YYY_copy(&(w->b), x);
+    FP8_YYY_zero(&(w->a));
+}
+
+/* FP16 copy w=x */
+void FP16_YYY_copy(FP16_YYY *w,FP16_YYY *x)
+{
+    if (w==x) return; /* self-copy is a no-op */
+    FP8_YYY_copy(&(w->b), &(x->b));
+    FP8_YYY_copy(&(w->a), &(x->a));
+}
+
+/* FP16 w=0 */
+void FP16_YYY_zero(FP16_YYY *w)
+{
+    /* clear both FP8 components */
+    FP8_YYY_zero(&(w->b));
+    FP8_YYY_zero(&(w->a));
+}
+
+/* FP16 w=1 */
+void FP16_YYY_one(FP16_YYY *w)
+{
+    /* multiplicative identity: real part one, imaginary part zero */
+    FP8_YYY_zero(&(w->b));
+    FP8_YYY_one(&(w->a));
+}
+
+/* Set w=-x */
+/* Negates both halves using a single FP8 negation:          */
+/*   m = -(a+b), then w->a = m+b = -a and w->b = m+a = -b.   */
+/* Also norms x in place and norms the result.               */
+void FP16_YYY_neg(FP16_YYY *w,FP16_YYY *x)
+{
+    /* Just one field neg */
+    FP8_YYY m,t;
+    FP16_YYY_norm(x);
+    FP8_YYY_add(&m,&(x->a),&(x->b));
+    FP8_YYY_norm(&m);
+    FP8_YYY_neg(&m,&m);
+    FP8_YYY_add(&t,&m,&(x->b));
+    FP8_YYY_add(&(w->b),&m,&(x->a));
+    FP8_YYY_copy(&(w->a),&t);
+    FP16_YYY_norm(w);
+}
+
+/* Set w=conj(x) */
+void FP16_YYY_conj(FP16_YYY *w,FP16_YYY *x)
+{
+    /* conjugation negates only the imaginary FP8 component */
+    FP8_YYY_neg(&(w->b), &(x->b));
+    FP8_YYY_copy(&(w->a), &(x->a));
+    FP16_YYY_norm(w);
+}
+
+/* Set w=-conj(x) */
+void FP16_YYY_nconj(FP16_YYY *w,FP16_YYY *x)
+{
+    /* negated conjugate: negate the real part, keep the imaginary part */
+    FP8_YYY_neg(&(w->a), &(x->a));
+    FP8_YYY_copy(&(w->b),&(x->b));
+    FP16_YYY_norm(w);
+}
+
+/* Set w=x+y */
+void FP16_YYY_add(FP16_YYY *w,FP16_YYY *x,FP16_YYY *y)
+{
+    /* component-wise FP8 addition */
+    FP8_YYY_add(&(w->b), &(x->b), &(y->b));
+    FP8_YYY_add(&(w->a), &(x->a), &(y->a));
+}
+
+/* Set w=x-y */
+/* Input y MUST be normed */
+/* Implemented as w = x + (-y); FP16_YYY_neg norms its argument and */
+/* result, which is the reason y must arrive normed. */
+void FP16_YYY_sub(FP16_YYY *w,FP16_YYY *x,FP16_YYY *y)
+{
+    FP16_YYY my;
+
+    FP16_YYY_neg(&my, y);
+    FP16_YYY_add(w, x, &my);
+
+}
+
+/* reduce all components of w mod Modulus */
+void FP16_YYY_reduce(FP16_YYY *w)
+{
+    FP8_YYY_reduce(&(w->b));
+    FP8_YYY_reduce(&(w->a));
+}
+
+/* normalise all elements of w */
+void FP16_YYY_norm(FP16_YYY *w)
+{
+    FP8_YYY_norm(&(w->b));
+    FP8_YYY_norm(&(w->a));
+}
+
+/* Set w=s*x, where s is FP8 */
+/* component-wise scalar multiplication by an FP8 element */
+void FP16_YYY_pmul(FP16_YYY *w,FP16_YYY *x,FP8_YYY *s)
+{
+    FP8_YYY_mul(&(w->a),&(x->a),s);
+    FP8_YYY_mul(&(w->b),&(x->b),s);
+}
+
+/* Set w=s*x, where s is FP2 */
+/* component-wise scalar multiplication by an FP2 element */
+void FP16_YYY_qmul(FP16_YYY *w,FP16_YYY *x,FP2_YYY *s)
+{
+    FP8_YYY_qmul(&(w->a),&(x->a),s);
+    FP8_YYY_qmul(&(w->b),&(x->b),s);
+}
+
+/* Set w=s*x, where s is int */
+/* component-wise multiplication by a small integer */
+void FP16_YYY_imul(FP16_YYY *w,FP16_YYY *x,int s)
+{
+    FP8_YYY_imul(&(w->a),&(x->a),s);
+    FP8_YYY_imul(&(w->b),&(x->b),s);
+}
+
+/* Set w=x^2 */
+/* Input MUST be normed  */
+/* Complex squaring with lazy reduction: the // N comments track the   */
+/* excess bound of each intermediate; norm placement is load-bearing.  */
+void FP16_YYY_sqr(FP16_YYY *w,FP16_YYY *x)
+{
+    FP8_YYY t1,t2,t3;
+
+    FP8_YYY_mul(&t3,&(x->a),&(x->b)); /* norms x */
+    FP8_YYY_copy(&t2,&(x->b));
+    FP8_YYY_add(&t1,&(x->a),&(x->b));
+    FP8_YYY_times_i(&t2);
+
+    FP8_YYY_add(&t2,&(x->a),&t2);
+
+    FP8_YYY_norm(&t1);  // 2
+    FP8_YYY_norm(&t2);  // 2
+
+    FP8_YYY_mul(&(w->a),&t1,&t2);
+
+    FP8_YYY_copy(&t2,&t3);
+    FP8_YYY_times_i(&t2);
+
+    FP8_YYY_add(&t2,&t2,&t3);
+
+    FP8_YYY_norm(&t2);  // 2
+    FP8_YYY_neg(&t2,&t2);
+    FP8_YYY_add(&(w->a),&(w->a),&t2);  /* a=(a+b)(a+i^2.b)-i^2.ab-ab = a*a+ib*ib */
+    FP8_YYY_add(&(w->b),&t3,&t3);  /* b=2ab */
+
+    FP16_YYY_norm(w);
+}
+
+/* Set w=x*y */
+/* Inputs MUST be normed  */
+/* Karatsuba-style: 3 FP8 muls instead of 4.                            */
+/*   w.a = xa.ya + i^2.xb.yb                                            */
+/*   w.b = (xa+xb)(ya+yb) - xa.ya - xb.yb                               */
+/* // N comments record the excess bound of each intermediate.          */
+void FP16_YYY_mul(FP16_YYY *w,FP16_YYY *x,FP16_YYY *y)
+{
+
+    FP8_YYY t1,t2,t3,t4;
+    FP8_YYY_mul(&t1,&(x->a),&(y->a));
+    FP8_YYY_mul(&t2,&(x->b),&(y->b));
+
+    FP8_YYY_add(&t3,&(y->b),&(y->a));
+    FP8_YYY_add(&t4,&(x->b),&(x->a));
+
+    FP8_YYY_norm(&t4); // 2
+    FP8_YYY_norm(&t3); // 2
+
+    FP8_YYY_mul(&t4,&t4,&t3); /* (xa+xb)(ya+yb) */
+
+    FP8_YYY_neg(&t3,&t1);  // 1
+    FP8_YYY_add(&t4,&t4,&t3);  //t4E=3
+    FP8_YYY_norm(&t4);
+
+    FP8_YYY_neg(&t3,&t2);  // 1
+    FP8_YYY_add(&(w->b),&t4,&t3); //wbE=3
+
+    FP8_YYY_times_i(&t2);
+    FP8_YYY_add(&(w->a),&t2,&t1);
+
+    FP16_YYY_norm(w);
+}
+
+/* output FP16 in format [a,b] */
+/* Debug print to stdout; diagnostics only, not constant-time */
+void FP16_YYY_output(FP16_YYY *w)
+{
+    printf("[");
+    FP8_YYY_output(&(w->a));
+    printf(",");
+    FP8_YYY_output(&(w->b));
+    printf("]");
+}
+
+/* raw (un-reduced) debug print of both FP8 components, format [a,b] */
+void FP16_YYY_rawoutput(FP16_YYY *w)
+{
+    printf("[");
+    FP8_YYY_rawoutput(&(w->a));
+    printf(",");
+    FP8_YYY_rawoutput(&(w->b));
+    printf("]");
+}
+
+/* Set w=1/x */
+/* Conjugate trick: 1/(a+ib) = (a-ib)/(a^2 - i^2.b^2).              */
+/* NOTE(review): no zero check — behaviour for x==0 is whatever     */
+/* FP8_YYY_inv does with a zero argument; confirm callers avoid it. */
+void FP16_YYY_inv(FP16_YYY *w,FP16_YYY *x)
+{
+    FP8_YYY t1,t2;
+    FP8_YYY_sqr(&t1,&(x->a));
+    FP8_YYY_sqr(&t2,&(x->b));
+    FP8_YYY_times_i(&t2);
+    FP8_YYY_norm(&t2);
+
+    FP8_YYY_sub(&t1,&t1,&t2);   /* t1 = a^2 - i^2.b^2 (the norm) */
+    FP8_YYY_norm(&t1);
+
+    FP8_YYY_inv(&t1,&t1);
+
+    FP8_YYY_mul(&(w->a),&t1,&(x->a));
+    FP8_YYY_neg(&t1,&t1);
+    FP8_YYY_norm(&t1);
+    FP8_YYY_mul(&(w->b),&t1,&(x->b));
+}
+
+/* w*=i where i = sqrt(sqrt(-1+sqrt(-1))) */
+void FP16_YYY_times_i(FP16_YYY *w)
+{
+    /* (a+ib)*i = (i^2.b) + ia: the new real part is the old imaginary */
+    /* part mapped through the FP8-level i, and the new imaginary part */
+    /* is the old real part. */
+    FP8_YYY t;
+    FP8_YYY_copy(&t,&(w->a));
+    FP8_YYY_copy(&(w->a),&(w->b));
+    FP8_YYY_times_i(&(w->a));
+    FP8_YYY_copy(&(w->b),&t);
+    FP16_YYY_norm(w);
+}
+
+/* multiply both FP8 components by the FP8-level i */
+void FP16_YYY_times_i2(FP16_YYY *w)
+{
+    FP8_YYY_times_i(&(w->b));
+    FP8_YYY_times_i(&(w->a));
+}
+
+/* apply the FP8-level times_i2 map to each component */
+void FP16_YYY_times_i4(FP16_YYY *w)
+{
+    FP8_YYY_times_i2(&(w->b));
+    FP8_YYY_times_i2(&(w->a));
+}
+
+/* Set w=w^p using Frobenius */
+/* Applies the FP8 Frobenius to both components with f^2, then corrects */
+/* the imaginary component by the precomputed constant f so the overall */
+/* map is the p-power Frobenius on FP16. */
+void FP16_YYY_frob(FP16_YYY *w,FP2_YYY *f)
+{
+    // f=(i+1)^(p-3)/8
+    FP2_YYY ff;
+
+    FP2_YYY_sqr(&ff,f);  // (i+1)^(p-3)/4
+    FP2_YYY_norm(&ff);
+
+    FP8_YYY_frob(&(w->a),&ff);
+    FP8_YYY_frob(&(w->b),&ff);
+
+    FP8_YYY_qmul(&(w->b),&(w->b),f);  // times (1+i)^(p-3)/8
+    FP8_YYY_times_i(&(w->b));		// (i+1)^(p-1)/8
+}
+
+/* Set r=a^b mod m */
+/* Right-to-left binary exponentiation: keep squaring a running base, */
+/* folding it into r whenever the current low bit of the exponent is  */
+/* set. Not constant-time (loop length tracks the exponent).          */
+void FP16_YYY_pow(FP16_YYY *r,FP16_YYY * a,BIG_XXX b)
+{
+    FP16_YYY base;
+    BIG_XXX e,zero;
+    int bit;
+
+    BIG_XXX_zero(zero);
+    BIG_XXX_norm(b);
+    BIG_XXX_copy(e,b);
+    FP16_YYY_copy(&base,a);
+    FP16_YYY_one(r);
+
+    for (;;)
+    {
+        bit=BIG_XXX_parity(e);
+        BIG_XXX_shr(e,1);
+        if (bit) FP16_YYY_mul(r,r,&base);
+        if (BIG_XXX_comp(e,zero)==0) break;
+        FP16_YYY_sqr(&base,&base);
+    }
+    FP16_YYY_reduce(r);
+}
+
+/* Move b to a if d=1 */
+/* component-wise conditional move; d acts as a mask in FP8_YYY_cmove */
+void FP16_YYY_cmove(FP16_YYY *f,FP16_YYY *g,int d)
+{
+    FP8_YYY_cmove(&(f->b),&(g->b),d);
+    FP8_YYY_cmove(&(f->a),&(g->a),d);
+}
+
+#if CURVE_SECURITY_ZZZ == 256
+
+/* XTR xtr_a function */
+/* Computes r = (w-y)*x.a + i.(w+y)*x.b + z — the XTR "A" combination */
+/* step on traces (Stam/Lenstra). Result is reduced before return.    */
+void FP16_YYY_xtr_A(FP16_YYY *r,FP16_YYY *w,FP16_YYY *x,FP16_YYY *y,FP16_YYY *z)
+{
+    FP16_YYY t1,t2;
+
+    FP16_YYY_copy(r,x);
+    FP16_YYY_sub(&t1,w,y);
+    FP16_YYY_norm(&t1);
+    FP16_YYY_pmul(&t1,&t1,&(r->a));
+    FP16_YYY_add(&t2,w,y);
+    FP16_YYY_norm(&t2);
+    FP16_YYY_pmul(&t2,&t2,&(r->b));
+    FP16_YYY_times_i(&t2);
+
+    FP16_YYY_add(r,&t1,&t2);
+    FP16_YYY_add(r,r,z);
+
+    FP16_YYY_reduce(r);
+}
+
+/* XTR xtr_d function */
+/* r = x^2 - 2*conj(x): the XTR trace-doubling step. */
+void FP16_YYY_xtr_D(FP16_YYY *r,FP16_YYY *x)
+{
+    FP16_YYY w;
+    FP16_YYY_copy(r,x);
+    FP16_YYY_conj(&w,r);
+    FP16_YYY_add(&w,&w,&w);    /* w = 2*conj(x) */
+    FP16_YYY_sqr(r,r);
+    FP16_YYY_norm(&w);
+    FP16_YYY_sub(r,r,&w);
+    FP16_YYY_reduce(r);    /* reduce here as multiple calls trigger automatic reductions */
+}
+
+/* r=x^n using XTR method on traces of FP12s */
+/* NOTE(review): comment inherited from the FP4 version — for FP16     */
+/* traces the underlying elements are presumably FP48s, not FP12s;     */
+/* confirm against the pairing code before relying on it.              */
+/* Laddering over the bits of n with the xtr_A/xtr_D recurrences;      */
+/* x is temporarily conjugated in place and restored in the 0-bit arm. */
+void FP16_YYY_xtr_pow(FP16_YYY *r,FP16_YYY *x,BIG_XXX n)
+{
+    int i,par,nb;
+    BIG_XXX v;
+    FP2_YYY w2;
+    FP4_YYY w4;
+    FP8_YYY w8;
+    FP16_YYY t,a,b,c;
+
+    /* a = trace of the identity element, i.e. the constant 3 */
+    BIG_XXX_zero(v);
+    BIG_XXX_inc(v,3);
+    BIG_XXX_norm(v);
+    FP2_YYY_from_BIG(&w2,v);
+    FP4_YYY_from_FP2(&w4,&w2);
+    FP8_YYY_from_FP4(&w8,&w4);
+    FP16_YYY_from_FP8(&a,&w8);
+
+    FP16_YYY_copy(&b,x);
+    FP16_YYY_xtr_D(&c,x);
+
+    BIG_XXX_norm(n);
+    par=BIG_XXX_parity(n);
+    BIG_XXX_copy(v,n);
+    BIG_XXX_shr(v,1);
+    if (par==0)
+    {
+        BIG_XXX_dec(v,1);
+        BIG_XXX_norm(v);
+    }
+
+    nb=BIG_XXX_nbits(v);
+    for (i=nb-1; i>=0; i--)
+    {
+        if (!BIG_XXX_bit(v,i))
+        {
+            FP16_YYY_copy(&t,&b);
+            FP16_YYY_conj(x,x);
+            FP16_YYY_conj(&c,&c);
+            FP16_YYY_xtr_A(&b,&a,&b,x,&c);
+            FP16_YYY_conj(x,x);    /* restore x */
+            FP16_YYY_xtr_D(&c,&t);
+            FP16_YYY_xtr_D(&a,&a);
+        }
+        else
+        {
+            FP16_YYY_conj(&t,&a);
+            FP16_YYY_xtr_D(&a,&b);
+            FP16_YYY_xtr_A(&b,&c,&b,x,&t);
+            FP16_YYY_xtr_D(&c,&c);
+        }
+    }
+
+    /* pick the ladder register matching the parity of n */
+    if (par==0) FP16_YYY_copy(r,&c);
+    else FP16_YYY_copy(r,&b);
+    FP16_YYY_reduce(r);
+}
+
+/* r=ck^a.cl^n using XTR double exponentiation method on traces of FP12s. See Stam thesis. */
+/* NOTE(review): as with xtr_pow, the "FP12s" in the comment is inherited  */
+/* from a smaller tower — confirm the actual trace field for FP16.         */
+/* Binary GCD-like reduction of the exponent pair (d,e); each branch       */
+/* applies a matching update to the four trace registers cu,cv,cumv,cum2v. */
+/* r is used as scratch inside the loop before receiving the final result. */
+void FP16_YYY_xtr_pow2(FP16_YYY *r,FP16_YYY *ck,FP16_YYY *cl,FP16_YYY *ckml,FP16_YYY *ckm2l,BIG_XXX a,BIG_XXX b)
+{
+    int i,f2;
+    BIG_XXX d,e,w;
+    FP16_YYY t,cu,cv,cumv,cum2v;
+
+    BIG_XXX_norm(a);
+    BIG_XXX_norm(b);
+    BIG_XXX_copy(e,a);
+    BIG_XXX_copy(d,b);
+    FP16_YYY_copy(&cu,ck);
+    FP16_YYY_copy(&cv,cl);
+    FP16_YYY_copy(&cumv,ckml);
+    FP16_YYY_copy(&cum2v,ckm2l);
+
+    /* strip common factors of 2; undone by f2 doublings at the end */
+    f2=0;
+    while (BIG_XXX_parity(d)==0 && BIG_XXX_parity(e)==0)
+    {
+        BIG_XXX_shr(d,1);
+        BIG_XXX_shr(e,1);
+        f2++;
+    }
+    while (BIG_XXX_comp(d,e)!=0)
+    {
+        if (BIG_XXX_comp(d,e)>0)
+        {
+            BIG_XXX_imul(w,e,4);
+            BIG_XXX_norm(w);
+            if (BIG_XXX_comp(d,w)<=0)
+            {
+                BIG_XXX_copy(w,d);
+                BIG_XXX_copy(d,e);
+                BIG_XXX_sub(e,w,e);
+                BIG_XXX_norm(e);
+                FP16_YYY_xtr_A(&t,&cu,&cv,&cumv,&cum2v);
+                FP16_YYY_conj(&cum2v,&cumv);
+                FP16_YYY_copy(&cumv,&cv);
+                FP16_YYY_copy(&cv,&cu);
+                FP16_YYY_copy(&cu,&t);
+            }
+            else if (BIG_XXX_parity(d)==0)
+            {
+                BIG_XXX_shr(d,1);
+                FP16_YYY_conj(r,&cum2v);
+                FP16_YYY_xtr_A(&t,&cu,&cumv,&cv,r);
+                FP16_YYY_xtr_D(&cum2v,&cumv);
+                FP16_YYY_copy(&cumv,&t);
+                FP16_YYY_xtr_D(&cu,&cu);
+            }
+            else if (BIG_XXX_parity(e)==1)
+            {
+                BIG_XXX_sub(d,d,e);
+                BIG_XXX_norm(d);
+                BIG_XXX_shr(d,1);
+                FP16_YYY_xtr_A(&t,&cu,&cv,&cumv,&cum2v);
+                FP16_YYY_xtr_D(&cu,&cu);
+                FP16_YYY_xtr_D(&cum2v,&cv);
+                FP16_YYY_conj(&cum2v,&cum2v);
+                FP16_YYY_copy(&cv,&t);
+            }
+            else
+            {
+                BIG_XXX_copy(w,d);
+                BIG_XXX_copy(d,e);
+                BIG_XXX_shr(d,1);
+                BIG_XXX_copy(e,w);
+                FP16_YYY_xtr_D(&t,&cumv);
+                FP16_YYY_conj(&cumv,&cum2v);
+                FP16_YYY_conj(&cum2v,&t);
+                FP16_YYY_xtr_D(&t,&cv);
+                FP16_YYY_copy(&cv,&cu);
+                FP16_YYY_copy(&cu,&t);
+            }
+        }
+        if (BIG_XXX_comp(d,e)<0)
+        {
+            BIG_XXX_imul(w,d,4);
+            BIG_XXX_norm(w);
+            if (BIG_XXX_comp(e,w)<=0)
+            {
+                BIG_XXX_sub(e,e,d);
+                BIG_XXX_norm(e);
+                FP16_YYY_xtr_A(&t,&cu,&cv,&cumv,&cum2v);
+                FP16_YYY_copy(&cum2v,&cumv);
+                FP16_YYY_copy(&cumv,&cu);
+                FP16_YYY_copy(&cu,&t);
+            }
+            else if (BIG_XXX_parity(e)==0)
+            {
+                BIG_XXX_copy(w,d);
+                BIG_XXX_copy(d,e);
+                BIG_XXX_shr(d,1);
+                BIG_XXX_copy(e,w);
+                FP16_YYY_xtr_D(&t,&cumv);
+                FP16_YYY_conj(&cumv,&cum2v);
+                FP16_YYY_conj(&cum2v,&t);
+                FP16_YYY_xtr_D(&t,&cv);
+                FP16_YYY_copy(&cv,&cu);
+                FP16_YYY_copy(&cu,&t);
+            }
+            else if (BIG_XXX_parity(d)==1)
+            {
+                BIG_XXX_copy(w,e);
+                BIG_XXX_copy(e,d);
+                BIG_XXX_sub(w,w,d);
+                BIG_XXX_norm(w);
+                BIG_XXX_copy(d,w);
+                BIG_XXX_shr(d,1);
+                FP16_YYY_xtr_A(&t,&cu,&cv,&cumv,&cum2v);
+                FP16_YYY_conj(&cumv,&cumv);
+                FP16_YYY_xtr_D(&cum2v,&cu);
+                FP16_YYY_conj(&cum2v,&cum2v);
+                FP16_YYY_xtr_D(&cu,&cv);
+                FP16_YYY_copy(&cv,&t);
+            }
+            else
+            {
+                BIG_XXX_shr(d,1);
+                FP16_YYY_conj(r,&cum2v);
+                FP16_YYY_xtr_A(&t,&cu,&cumv,&cv,r);
+                FP16_YYY_xtr_D(&cum2v,&cumv);
+                FP16_YYY_copy(&cumv,&t);
+                FP16_YYY_xtr_D(&cu,&cu);
+            }
+        }
+    }
+    FP16_YYY_xtr_A(r,&cu,&cv,&cumv,&cum2v);
+    for (i=0; i<f2; i++)	FP16_YYY_xtr_D(r,r);
+    FP16_YYY_xtr_pow(r,r,d);
+}
+
+#endif
+
+#ifdef HAS_MAIN
+/* NOTE(review): this self-test appears to predate the XXX/YYY templating — */
+/* it uses un-suffixed FP2/FP8/BIG names (FP8_pow, FP8_xtr_pow2, etc.)      */
+/* which do not match the FP16_YYY_* definitions in this unit, so it will   */
+/* presumably not compile if HAS_MAIN is defined. Update the names (and     */
+/* the FP8->FP16 types) before enabling — confirm against the build.        */
+int main()
+{
+    FP2 w0,w1,f;
+    FP8 w,t;
+    FP8 c1,c2,c3,c4,cr;
+    BIG a,b;
+    BIG e,e1,e2;
+    BIG p,md;
+
+    BIG_rcopy(md,Modulus);
+    //Test w^(P^4) = w mod p^2
+    BIG_zero(a);
+    BIG_inc(a,27);
+    BIG_zero(b);
+    BIG_inc(b,45);
+    FP2_from_BIGs(&w0,a,b);
+
+    BIG_zero(a);
+    BIG_inc(a,33);
+    BIG_zero(b);
+    BIG_inc(b,54);
+    FP2_from_BIGs(&w1,a,b);
+
+    FP8_from_FP2s(&w,&w0,&w1);
+    FP8_reduce(&w);
+
+    printf("w= ");
+    FP8_output(&w);
+    printf("\n");
+
+    FP8_copy(&t,&w);
+
+    BIG_copy(p,md);
+    FP8_pow(&w,&w,p);
+
+    printf("w^p= ");
+    FP8_output(&w);
+    printf("\n");
+
+    BIG_rcopy(a,CURVE_Fra);
+    BIG_rcopy(b,CURVE_Frb);
+    FP2_from_BIGs(&f,a,b);
+
+    FP8_frob(&t,&f);
+    printf("w^p= ");
+    FP8_output(&t);
+    printf("\n");
+
+    FP8_pow(&w,&w,p);
+    FP8_pow(&w,&w,p);
+    FP8_pow(&w,&w,p);
+    printf("w^p4= ");
+    FP8_output(&w);
+    printf("\n");
+
+    // Test 1/(1/x) = x mod p^4
+    FP8_from_FP2s(&w,&w0,&w1);
+    printf("Test Inversion \nw= ");
+    FP8_output(&w);
+    printf("\n");
+
+    FP8_inv(&w,&w);
+    printf("1/w mod p^4 = ");
+    FP8_output(&w);
+    printf("\n");
+
+    FP8_inv(&w,&w);
+    printf("1/(1/w) mod p^4 = ");
+    FP8_output(&w);
+    printf("\n");
+
+    BIG_zero(e);
+    BIG_inc(e,12);
+
+    //	FP8_xtr_A(&w,&t,&w,&t,&t);
+    FP8_xtr_pow(&w,&w,e);
+
+    printf("w^e= ");
+    FP8_output(&w);
+    printf("\n");
+
+    BIG_zero(a);
+    BIG_inc(a,37);
+    BIG_zero(b);
+    BIG_inc(b,17);
+    FP2_from_BIGs(&w0,a,b);
+
+    BIG_zero(a);
+    BIG_inc(a,49);
+    BIG_zero(b);
+    BIG_inc(b,31);
+    FP2_from_BIGs(&w1,a,b);
+
+    FP8_from_FP2s(&c1,&w0,&w1);
+    FP8_from_FP2s(&c2,&w0,&w1);
+    FP8_from_FP2s(&c3,&w0,&w1);
+    FP8_from_FP2s(&c4,&w0,&w1);
+
+    BIG_zero(e1);
+    BIG_inc(e1,3331);
+    BIG_zero(e2);
+    BIG_inc(e2,3372);
+
+    FP8_xtr_pow2(&w,&c1,&w,&c2,&c3,e1,e2);
+
+    printf("c^e= ");
+    FP8_output(&w);
+    printf("\n");
+
+    return 0;
+}
+#endif
+