You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@milagro.apache.org by sa...@apache.org on 2020/03/25 09:51:21 UTC

[incubator-milagro-crypto-c] branch issue74-review-ct created (now 55d9037)

This is an automated email from the ASF dual-hosted git repository.

sandreoli pushed a change to branch issue74-review-ct
in repository https://gitbox.apache.org/repos/asf/incubator-milagro-crypto-c.git.


      at 55d9037  make invmodp CT

This branch includes the following new commits:

     new 5e80912  make comparison and others ct
     new 55d9037  make invmodp CT

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.

[incubator-milagro-crypto-c] 01/02: make comparison and others ct

Posted by sa...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

sandreoli pushed a commit to branch issue74-review-ct
in repository https://gitbox.apache.org/repos/asf/incubator-milagro-crypto-c.git

commit 5e809128a6186f80992bd6625d1afa625e0573a6
Author: Samuele Andreoli <sa...@yahoo.it>
AuthorDate: Wed Mar 25 00:50:22 2020 +0000

    make comparison and others ct
---
 include/big.h.in |  6 +++---
 src/big.c.in     | 56 ++++++++++++++++++++++++++++++++------------------------
 src/ff.c.in      | 56 ++++++++++++++++++++++++++++++++++++++------------------
 src/wcc.c.in     |  2 ++
 src/wcc192.c.in  |  2 ++
 src/wcc256.c.in  |  2 ++
 6 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/include/big.h.in b/include/big.h.in
index 7e89b36..f4c7582 100644
--- a/include/big.h.in
+++ b/include/big.h.in
@@ -61,19 +61,19 @@ typedef chunk DBIG_XXX[DNLEN_XXX];   /**< Define type DBIG as array of chunks */
 
 /* BIG number prototypes */
 
-/**	@brief Tests for BIG equal to zero
+/**	@brief Tests for BIG equal to zero - input must be normalised
  *
 	@param x a BIG number
 	@return 1 if zero, else returns 0
  */
 extern int BIG_XXX_iszilch(BIG_XXX x);
-/**	@brief Tests for BIG equal to one
+/**	@brief Tests for BIG equal to one - input must be normalised
  *
 	@param x a BIG number
 	@return 1 if one, else returns 0
  */
 extern int BIG_XXX_isunity(BIG_XXX x);
-/**	@brief Tests for DBIG equal to zero
+/**	@brief Tests for DBIG equal to zero - input must be normalised
  *
 	@param x a DBIG number
 	@return 1 if zero, else returns 0
diff --git a/src/big.c.in b/src/big.c.in
index 3a7980b..c8c4929 100644
--- a/src/big.c.in
+++ b/src/big.c.in
@@ -26,28 +26,36 @@
 int BIG_XXX_iszilch(BIG_XXX a)
 {
     int i;
+    chunk d = 0;
+
     for (i=0; i<NLEN_XXX; i++)
-        if (a[i]!=0) return 0;
-    return 1;
+        d |= a[i];
+
+    return (1 & ((d-1)>>BASEBITS_XXX));
 }
 
 /* test a=1? */
 int BIG_XXX_isunity(BIG_XXX a)
 {
     int i;
+    chunk d = 0;
+
     for(i=1; i<NLEN_XXX; i++)
-        if (a[i]!=0) return 0;
-    if (a[0]!=1) return 0;
-    return 1;
+        d |= a[i];
+
+    return (1 & ((d-1)>>BASEBITS_XXX) & ((a[0]^1)-1)>>BASEBITS_XXX);
 }
 
 /* test a=0? */
 int BIG_XXX_diszilch(DBIG_XXX a)
 {
     int i;
+    chunk d = 0;
+
     for (i=0; i<DNLEN_XXX; i++)
-        if (a[i]!=0) return 0;
-    return 1;
+        d |= a[i];
+
+    return (1 & ((d-1)>>BASEBITS_XXX));
 }
 
 /* SU= 56 */
@@ -785,12 +793,7 @@ void BIG_XXX_monty(BIG_XXX a,BIG_XXX md,chunk MC,DBIG_XXX d)
     chunk m,carry;
     for (i=0; i<NLEN_XXX; i++)
     {
-        if (MC==-1) m=(-d[i])&BMASK_XXX;
-        else
-        {
-            if (MC==1) m=d[i];
-            else m=(MC*d[i])&BMASK_XXX;
-        }
+        m = (MC*d[i])&BMASK_XXX;
         carry=0;
         for (j=0; j<NLEN_XXX; j++)
             carry=muladd_XXX(m,md[j],carry,&d[i+j]);
@@ -1014,25 +1017,31 @@ void BIG_XXX_dnorm(DBIG_XXX a)
 int BIG_XXX_comp(BIG_XXX a,BIG_XXX b)
 {
     int i;
-    for (i=NLEN_XXX-1; i>=0; i--)
+    chunk gt = 0;
+    chunk eq = 1;
+
+    for (i = NLEN_XXX-1; i>=0; i--)
     {
-        if (a[i]==b[i]) continue;
-        if (a[i]>b[i]) return 1;
-        else  return -1;
+        gt |= ((b[i]-a[i]) >> BASEBITS_XXX) & eq;
+        eq &= ((b[i]^a[i])-1) >> BASEBITS_XXX;
     }
-    return 0;
+
+    return (int)(gt+gt+eq-1);
 }
 
 int BIG_XXX_dcomp(DBIG_XXX a,DBIG_XXX b)
 {
     int i;
+    chunk gt = 0;
+    chunk eq = 1;
+
     for (i=DNLEN_XXX-1; i>=0; i--)
     {
-        if (a[i]==b[i]) continue;
-        if (a[i]>b[i]) return 1;
-        else  return -1;
+        gt |= ((b[i]-a[i]) >> BASEBITS_XXX) & eq;
+        eq &= ((b[i]^a[i])-1) >> BASEBITS_XXX;
     }
-    return 0;
+
+    return (int)(gt+gt+eq-1);
 }
 
 /* return number of bits in a */
@@ -1231,8 +1240,7 @@ int BIG_XXX_parity(BIG_XXX a)
 /* SU= 16 */
 int BIG_XXX_bit(BIG_XXX a,int n)
 {
-    if (a[n/BASEBITS_XXX]&((chunk)1<<(n%BASEBITS_XXX))) return 1;
-    else return 0;
+    return ((int)(a[n/BASEBITS_XXX]>>(n%BASEBITS_XXX))) & 1;
 }
 
 /* return last n bits of a, where n is small < BASEBITS */
diff --git a/src/ff.c.in b/src/ff.c.in
index 3f83bc2..50a2e81 100644
--- a/src/ff.c.in
+++ b/src/ff.c.in
@@ -71,9 +71,12 @@ void FF_WWW_zero(BIG_XXX x[],int n)
 int FF_WWW_iszilch(BIG_XXX x[],int n)
 {
     int i;
+    int rc = 1;
+
     for (i=0; i<n; i++)
-        if (!BIG_XXX_iszilch(x[i])) return 0;
-    return 1;
+        rc &= BIG_XXX_iszilch(x[i]);
+
+    return rc;
 }
 
 /* shift right by BIGBITS-bit words */
@@ -137,13 +140,19 @@ void FF_WWW_init(BIG_XXX x[],sign32 m,int n)
 /* compare x and y - must be normalised */
 int FF_WWW_comp(BIG_XXX x[],BIG_XXX y[],int n)
 {
-    int i,j;
+    int i;
+    int c;
+    int eq = 1;
+    int gt = 0;
+
     for (i=n-1; i>=0; i--)
     {
-        j=BIG_XXX_comp(x[i],y[i]);
-        if (j!=0) return j;
+        c = BIG_XXX_comp(x[i],y[i]);
+        gt += eq * (c * c + c);
+        eq *= 1 - c * c;
     }
-    return 0;
+
+    return gt + eq - 1;
 }
 
 /* recursive add */
@@ -305,6 +314,15 @@ static void FF_WWW_cswap(BIG_XXX a[],BIG_XXX b[],int d,int n)
     return;
 }
 
+/* copy b to a - side channel resistant */
+static void FF_WWW_cmove(BIG_XXX a[],BIG_XXX b[],int d,int n)
+{
+    int i;
+    for (i=0; i<n; i++)
+        BIG_XXX_cmove(a[i],b[i],d);
+    return;
+}
+
 /* z=x*y, t is workspace */
 static void FF_WWW_karmul(BIG_XXX z[],int zp,BIG_XXX x[],int xp,BIG_XXX y[],int yp,BIG_XXX t[],int tp,int n)
 {
@@ -425,6 +443,12 @@ void FF_WWW_mod(BIG_XXX b[],BIG_XXX c[],int n)
 {
     int k=0;
 
+#ifndef C99
+    BIG_XXX r[FFLEN_WWW];
+#else
+    BIG_XXX r[n];
+#endif
+
     FF_WWW_norm(b,n);
     if (FF_WWW_comp(b,c,n)<0)
         return;
@@ -438,11 +462,9 @@ void FF_WWW_mod(BIG_XXX b[],BIG_XXX c[],int n)
     while (k>0)
     {
         FF_WWW_shr(c,n);
-        if (FF_WWW_comp(b,c,n)>=0)
-        {
-            FF_WWW_sub(b,b,c,n);
-            FF_WWW_norm(b,n);
-        }
+        FF_WWW_sub(r,b,c,n);
+        FF_WWW_norm(r,n);
+        FF_WWW_cmove(b,r,FF_WWW_comp(b,c,n)>=0,n);
         k--;
     }
 }
@@ -488,9 +510,11 @@ void FF_WWW_dmod(BIG_XXX r[],BIG_XXX a[],BIG_XXX b[],int n)
 {
     int k;
 #ifndef C99
+    BIG_XXX s[2*FFLEN_WWW];
     BIG_XXX m[2*FFLEN_WWW];
     BIG_XXX x[2*FFLEN_WWW];
 #else
+    BIG_XXX s[2*n];
     BIG_XXX m[2*n];
     BIG_XXX x[2*n];
 #endif
@@ -508,13 +532,9 @@ void FF_WWW_dmod(BIG_XXX r[],BIG_XXX a[],BIG_XXX b[],int n)
     while (k>0)
     {
         FF_WWW_shr(m,2*n);
-
-        if (FF_WWW_comp(x,m,2*n)>=0)
-        {
-            FF_WWW_sub(x,x,m,2*n);
-            FF_WWW_norm(x,2*n);
-        }
-
+        FF_WWW_sub(s,x,m,2*n);
+        FF_WWW_norm(s,2*n);
+        FF_WWW_cmove(x,s,FF_WWW_comp(x,m,2*n)>=0,2*n);
         k--;
     }
     FF_WWW_copy(r,x,n);
diff --git a/src/wcc.c.in b/src/wcc.c.in
index 4b42a43..1b5ef03 100644
--- a/src/wcc.c.in
+++ b/src/wcc.c.in
@@ -155,6 +155,7 @@ int WCC_ZZZ_SENDER_KEY(int sha, octet *xOct, octet *piaOct, octet *pibOct, octet
 
     // z =  x + pia
     BIG_XXX_add(z,x,pia);
+    BIG_XXX_norm(z);
 
     // (x+pia).AKeyG1
     PAIR_ZZZ_G1mul(&sAG1,z);
@@ -244,6 +245,7 @@ int WCC_ZZZ_RECEIVER_KEY(int sha, octet *yOct, octet *wOct,  octet *piaOct, octe
 
     // y =  y + pib
     BIG_XXX_add(y,y,pib);
+    BIG_XXX_norm(y);
 
     // (y+pib).BKeyG2
     PAIR_ZZZ_G2mul(&sBG2,y);
diff --git a/src/wcc192.c.in b/src/wcc192.c.in
index f08eae7..e11ecd4 100644
--- a/src/wcc192.c.in
+++ b/src/wcc192.c.in
@@ -155,6 +155,7 @@ int WCC_ZZZ_SENDER_KEY(int sha, octet *xOct, octet *piaOct, octet *pibOct, octet
 
     // z =  x + pia
     BIG_XXX_add(z,x,pia);
+    BIG_XXX_norm(z);
 
     // (x+pia).AKeyG1
     PAIR_ZZZ_G1mul(&sAG1,z);
@@ -256,6 +257,7 @@ int WCC_ZZZ_RECEIVER_KEY(int sha, octet *yOct, octet *wOct,  octet *piaOct, octe
 
     // y =  y + pib
     BIG_XXX_add(y,y,pib);
+    BIG_XXX_norm(y);
 
     // (y+pib).BKeyG2
     PAIR_ZZZ_G2mul(&sBG2,y);
diff --git a/src/wcc256.c.in b/src/wcc256.c.in
index 0e1d345..374adcb 100644
--- a/src/wcc256.c.in
+++ b/src/wcc256.c.in
@@ -155,6 +155,7 @@ int WCC_ZZZ_SENDER_KEY(int sha, octet *xOct, octet *piaOct, octet *pibOct, octet
 
     // z =  x + pia
     BIG_XXX_add(z,x,pia);
+    BIG_XXX_norm(z);
 
     // (x+pia).AKeyG1
     PAIR_ZZZ_G1mul(&sAG1,z);
@@ -279,6 +280,7 @@ int WCC_ZZZ_RECEIVER_KEY(int sha, octet *yOct, octet *wOct,  octet *piaOct, octe
 
     // y =  y + pib
     BIG_XXX_add(y,y,pib);
+    BIG_XXX_norm(y);
 
     // (y+pib).BKeyG2
     PAIR_ZZZ_G2mul(&sBG2,y);

[incubator-milagro-crypto-c] 02/02: make invmodp CT

Posted by sa...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

sandreoli pushed a commit to branch issue74-review-ct
in repository https://gitbox.apache.org/repos/asf/incubator-milagro-crypto-c.git

commit 55d903760cb1b9bd04098a072f2f28130a9e2ce7
Author: Samuele Andreoli <sa...@yahoo.it>
AuthorDate: Wed Mar 25 09:41:03 2020 +0000

    make invmodp CT
---
 src/big.c.in | 131 +++++++++++++++++++++++++++++++--------------------------
 src/ff.c.in  | 134 +++++++++++++++++++++++++++++++++--------------------------
 2 files changed, 147 insertions(+), 118 deletions(-)

diff --git a/src/big.c.in b/src/big.c.in
index c8c4929..6bc18f2 100644
--- a/src/big.c.in
+++ b/src/big.c.in
@@ -1351,6 +1351,7 @@ void BIG_XXX_moddiv(BIG_XXX r,BIG_XXX a1,BIG_XXX b1,BIG_XXX m)
     BIG_XXX_copy(b,b1);
 
     BIG_XXX_mod(a,m);
+    BIG_XXX_mod(b,m);
     BIG_XXX_invmodp(z,b,m);
 
     BIG_XXX_mul(d,a,z);
@@ -1479,69 +1480,83 @@ void BIG_XXX_invmod2m(BIG_XXX a)
     BIG_XXX_mod2m(a,BIGBITS_XXX);
 }
 
-/* Set r=1/a mod p. Binary method */
-/* SU= 240 */
+/* Set r=1/a mod p. Kaliski method - on entry a < p*/
 void BIG_XXX_invmodp(BIG_XXX r,BIG_XXX a,BIG_XXX p)
 {
-    BIG_XXX u,v,x1,x2,t,one;
-    BIG_XXX_mod(a,p);
-    BIG_XXX_copy(u,a);
-    BIG_XXX_copy(v,p);
-    BIG_XXX_one(one);
-    BIG_XXX_copy(x1,one);
-    BIG_XXX_zero(x2);
+    int k, p1, pu, pv, psw, pmv;
+    BIG_XXX u, v, s, w;
+
+    BIG_XXX_copy(u, p);
+    BIG_XXX_copy(v,a);
+    BIG_XXX_zero(r);
+    BIG_XXX_one(s);
 
-    while (BIG_XXX_comp(u,one)!=0 && BIG_XXX_comp(v,one)!=0)
+    // v = a2^BIGBITS_XXX mod p
+    for (k = 0; k < BIGBITS_XXX; k++)
     {
-        while (BIG_XXX_parity(u)==0)
-        {
-            BIG_XXX_fshr(u,1);
-            if (BIG_XXX_parity(x1)!=0)
-            {
-                BIG_XXX_add(x1,p,x1);
-                BIG_XXX_norm(x1);
-            }
-            BIG_XXX_fshr(x1,1);
-        }
-        while (BIG_XXX_parity(v)==0)
-        {
-            BIG_XXX_fshr(v,1);
-            if (BIG_XXX_parity(x2)!=0)
-            {
-                BIG_XXX_add(x2,p,x2);
-                BIG_XXX_norm(x2);
-            }
-            BIG_XXX_fshr(x2,1);
-        }
-        if (BIG_XXX_comp(u,v)>=0)
-        {
-            BIG_XXX_sub(u,u,v);
-            BIG_XXX_norm(u);
-            if (BIG_XXX_comp(x1,x2)>=0) BIG_XXX_sub(x1,x1,x2);
-            else
-            {
-                BIG_XXX_sub(t,p,x2);
-                BIG_XXX_add(x1,x1,t);
-            }
-            BIG_XXX_norm(x1);
-        }
-        else
-        {
-            BIG_XXX_sub(v,v,u);
-            BIG_XXX_norm(v);
-            if (BIG_XXX_comp(x2,x1)>=0) BIG_XXX_sub(x2,x2,x1);
-            else
-            {
-                BIG_XXX_sub(t,p,x1);
-                BIG_XXX_add(x2,x2,t);
-            }
-            BIG_XXX_norm(x2);
-        }
+        BIG_XXX_sub(w, v, p);
+        BIG_XXX_norm(w);
+        BIG_XXX_cmove(v, w, (BIG_XXX_comp(v, p) > 0));
+        BIG_XXX_fshl(v, 1);
+    }
+
+    // CT Kaliski almost inverse
+    // The correction step is included
+    for (k = 0; k < 2 * BIGBITS_XXX; k++)
+    {
+        p1 = !BIG_XXX_iszilch(v);
+        
+        pu = BIG_XXX_parity(u);
+        pv = BIG_XXX_parity(v);
+        // Cases 2-4 of Kaliski
+        psw = p1 & ((!pu) | (pv & (BIG_XXX_comp(u,v)>0)));
+        // Cases 3-4 of Kaliski
+        pmv = p1 & pu & pv;
+
+        // Swap necessary for cases 2-4 of Kaliski
+        BIG_XXX_cswap(u, v, psw);
+        BIG_XXX_cswap(r, s, psw);
+
+        // Addition and subtraction for cases 3-4 of Kaliski
+        BIG_XXX_sub(w, v, u);
+        BIG_XXX_norm(w);
+        BIG_XXX_cmove(v, w, pmv);
+
+        BIG_XXX_add(w, r, s);
+        BIG_XXX_norm(w);
+        BIG_XXX_cmove(s, w, pmv);
+
+        // Subtraction for correction step
+        BIG_XXX_sub(w, r, p);
+        BIG_XXX_norm(w);
+        BIG_XXX_cmove(r, w, (!p1) & (BIG_XXX_comp(r, p) > 0));
+
+        // Shifts for all Kaliski cases and correction step
+        BIG_XXX_fshl(r, 1);
+        BIG_XXX_fshr(v, 1);
+
+        // Restore u,v,r,s to the original position
+        BIG_XXX_cswap(u, v, psw);
+        BIG_XXX_cswap(r, s, psw);        
+    }
+
+    // Last step of kaliski
+    // Moved after the correction step
+    BIG_XXX_sub(w, r, p);
+    BIG_XXX_norm(w);
+    BIG_XXX_cmove(r, w, (BIG_XXX_comp(r,p)>0));
+
+    BIG_XXX_sub(r, p, r);
+    BIG_XXX_norm(r);
+
+    // Restore inverse from Montgomery form
+    for (k = 0; k < BIGBITS_XXX; k++)
+    {
+        BIG_XXX_add(w, r, p);
+        BIG_XXX_norm(w);
+        BIG_XXX_cmove(r, w, BIG_XXX_parity(r));
+        BIG_XXX_fshr(r, 1);
     }
-    if (BIG_XXX_comp(u,one)==0)
-        BIG_XXX_copy(r,x1);
-    else
-        BIG_XXX_copy(r,x2);
 }
 
 /* set x = x mod 2^m */
diff --git a/src/ff.c.in b/src/ff.c.in
index 50a2e81..4b678f3 100644
--- a/src/ff.c.in
+++ b/src/ff.c.in
@@ -541,74 +541,88 @@ void FF_WWW_dmod(BIG_XXX r[],BIG_XXX a[],BIG_XXX b[],int n)
     FF_WWW_mod(r,b,n);
 }
 
-/* Set r=1/a mod p. Binary method - a<p on entry */
-
+/* Set r=1/a mod p. Kaliski method - a<p on entry */
 void FF_WWW_invmodp(BIG_XXX r[],BIG_XXX a[],BIG_XXX p[],int n)
 {
 #ifndef C99
-    BIG_XXX u[FFLEN_WWW],v[FFLEN_WWW],x1[FFLEN_WWW],x2[FFLEN_WWW],t[FFLEN_WWW],one[FFLEN_WWW];
+    BIG_XXX u[FFLEN_WWW],v[FFLEN_WWW],s[FFLEN_WWW],w[FFLEN_WWW];
 #else
-    BIG_XXX u[n],v[n],x1[n],x2[n],t[n],one[n];
+    BIG_XXX u[n],v[n],s[n],w[n];
 #endif
-    FF_WWW_copy(u,a,n);
-    FF_WWW_copy(v,p,n);
-    FF_WWW_one(one,n);
-    FF_WWW_copy(x1,one,n);
-    FF_WWW_zero(x2,n);
-
-    // reduce n in here as well!
-    while (FF_WWW_comp(u,one,n)!=0 && FF_WWW_comp(v,one,n)!=0)
+
+    int k, p1, pu, pv, psw, pmv;
+
+    FF_WWW_copy(u, p, n);
+    FF_WWW_copy(v, a, n);
+    FF_WWW_zero(r, n);
+    FF_WWW_one(s, n);
+
+    // v = a2^(n*BIGBITS_XXX) mod p
+    for(k = 0; k < n*BIGBITS_XXX; k++)
     {
-        while (FF_WWW_parity(u)==0)
-        {
-            FF_WWW_shr(u,n);
-            if (FF_WWW_parity(x1)!=0)
-            {
-                FF_WWW_add(x1,p,x1,n);
-                FF_WWW_norm(x1,n);
-            }
-            FF_WWW_shr(x1,n);
-        }
-        while (FF_WWW_parity(v)==0)
-        {
-            FF_WWW_shr(v,n);
-            if (FF_WWW_parity(x2)!=0)
-            {
-                FF_WWW_add(x2,p,x2,n);
-                FF_WWW_norm(x2,n);
-            }
-            FF_WWW_shr(x2,n);
-        }
-        if (FF_WWW_comp(u,v,n)>=0)
-        {
+        FF_WWW_sub(w, v, p, n);
+        FF_WWW_norm(w, n);
+        FF_WWW_cmove(v, w, (FF_WWW_comp(v, p, n) > 0), n);
+        FF_WWW_shl(v, n);
+    }
 
-            FF_WWW_sub(u,u,v,n);
-            FF_WWW_norm(u,n);
-            if (FF_WWW_comp(x1,x2,n)>=0) FF_WWW_sub(x1,x1,x2,n);
-            else
-            {
-                FF_WWW_sub(t,p,x2,n);
-                FF_WWW_add(x1,x1,t,n);
-            }
-            FF_WWW_norm(x1,n);
-        }
-        else
-        {
-            FF_WWW_sub(v,v,u,n);
-            FF_WWW_norm(v,n);
-            if (FF_WWW_comp(x2,x1,n)>=0) FF_WWW_sub(x2,x2,x1,n);
-            else
-            {
-                FF_WWW_sub(t,p,x1,n);
-                FF_WWW_add(x2,x2,t,n);
-            }
-            FF_WWW_norm(x2,n);
-        }
+    // CT Kaliski almost inverse
+    // The correction step is included
+    for (k = 0; k < 2*n*BIGBITS_XXX; k++)
+    {
+        p1 = !FF_WWW_iszilch(v, n);
+
+        pu = FF_WWW_parity(u);
+        pv = FF_WWW_parity(v);
+        // Cases 2-4 of Kaliski
+        psw = p1 & ((!pu) | (pv & (FF_WWW_comp(u, v, n)>0)));
+        // Cases 3-4 of Kaliski
+        pmv = p1 & pu & pv;
+
+        // Swap for cases 2-4 of Kaliski
+        FF_WWW_cswap(u, v, psw, n);
+        FF_WWW_cswap(r, s, psw, n);
+
+        // Addition and subtraction for cases 3-4 of Kaliski
+        FF_WWW_sub(w, v, u, n);
+        FF_WWW_norm(w, n);
+        FF_WWW_cmove(v, w, pmv, n);
+
+        FF_WWW_add(w, r, s, n);
+        FF_WWW_norm(w, n);
+        FF_WWW_cmove(s, w, pmv, n);
+
+        // Subtraction for correction step
+        FF_WWW_sub(w, r, p, n);
+        FF_WWW_norm(w, n);
+        FF_WWW_cmove(r, w, (!p1) & (FF_WWW_comp(r, p, n)>0), n);
+
+        // Shifts for all Kaliski cases and correction step
+        FF_WWW_shl(r, n);
+        FF_WWW_shr(v, n);
+
+        // Restore u,v,r,s to the original position
+        FF_WWW_cswap(u, v, psw, n);
+        FF_WWW_cswap(r, s, psw, n);
+    }
+
+    // Last step of Kaliski
+    // Moved after the correction step
+    FF_WWW_sub(w, r, p, n);
+    FF_WWW_norm(w, n);
+    FF_WWW_cmove(r, w, (FF_WWW_comp(r, p, n)>0), n);
+    
+    FF_WWW_sub(r, p, r, n);
+    FF_WWW_norm(r, n);
+
+    // Restore inverse from Montgomery form
+    for (k = 0; k < n*BIGBITS_XXX; k++)
+    {
+        FF_WWW_add(w, r, p, n);
+        FF_WWW_norm(w, n);
+        FF_WWW_cmove(r, w, FF_WWW_parity(r), n);
+        FF_WWW_shr(r, n);
     }
-    if (FF_WWW_comp(u,one,n)==0)
-        FF_WWW_copy(r,x1,n);
-    else
-        FF_WWW_copy(r,x2,n);
 }
 
 /* nesidue mod m */