ecm-6.4.4/0000755023561000001540000000000012113421640007265 500000000000000ecm-6.4.4/median.c0000644023561000001540000005030012106741273010615 00000000000000/* Median/middle product. Copyright 2003, 2004, 2005, 2006, 2007, 2008 Laurent Fousse, Paul Zimmermann, Alexander Kruppa, Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ /* Reference: [1] Tellegen's Principle into Practice, by A. Bostan, G. Lecerf and E. Schost, Proc. of ISSAC'03, Philadelphia, 2003. */ #include #include "ecm-impl.h" #ifndef MAX #define MAX(a,b) (((a) > (b)) ? (a) : (b)) #endif #ifndef MIN #define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) #endif extern unsigned int Fermat; static void list_add_wrapper (listz_t, listz_t, listz_t, unsigned int, unsigned int); static void list_sub_wrapper (listz_t, listz_t, listz_t, unsigned int, unsigned int); static unsigned int TKarMul (listz_t, unsigned int, listz_t, unsigned int, listz_t, unsigned int, listz_t); static void list_sub_safe (listz_t, listz_t, listz_t, unsigned int, unsigned int, unsigned int); static void list_add_safe (listz_t, listz_t, listz_t, unsigned int, unsigned int, unsigned int); static unsigned int TToomCookMul (listz_t, unsigned int, listz_t, unsigned int, listz_t, unsigned int, listz_t); static unsigned int TToomCookMul_space (unsigned int, unsigned int, unsigned int); static void list_add_wrapper (listz_t p, listz_t q, listz_t r, unsigned int n, unsigned int max_r) { list_add (p, q, r, MIN (n, max_r)); if (n > max_r) list_set (p + max_r, q + max_r, n - max_r); } static void list_sub_wrapper (listz_t p, listz_t q, listz_t r, unsigned int n, unsigned int max_r) { list_sub (p, q, r, MIN (n, max_r)); if (n > max_r) list_set (p + max_r, q + max_r, n - max_r); } /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients of degree m to n+m of rev(a)*c, i.e. b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l) ... b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k) ... b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n]. Using auxiliary memory in t. Implements algorithm TKarMul of [1]. Assumes deg(c) = l <= m+n. 
*/ static unsigned int TKarMul (listz_t b, unsigned int n, listz_t a, unsigned int m, listz_t c, unsigned int l, listz_t t) { unsigned int k, mu, nu, h; unsigned int s1; unsigned tot_muls = 0; #ifdef DEBUG fprintf (ECM_STDOUT, "Enter TKarMul.\nm = %d\nn = %d\nl = %d\n", m, n, l); fprintf (ECM_STDOUT, "a = "); print_list (a, m + 1); fprintf (ECM_STDOUT, "\nc = "); print_list (c, l + 1); fprintf (ECM_STDOUT, "\n"); #endif if (n == 0) { #ifdef DEBUG fprintf (ECM_STDOUT, "Case n = 0.\n"); #endif mpz_mul (b[0], a[0], c[0]); for (k = 1; (k <= m) && (k <= l); k++) mpz_addmul (b[0], a[k], c[k]); #ifdef DEBUG fprintf (ECM_STDOUT, "Exit TKarMul.\n"); #endif return MIN (m, l) + 1; } if (m == 0) { #ifdef DEBUG fprintf (ECM_STDOUT, "Case m = 0.\n"); #endif for (k = 0; (k <= l) && (k <= n); k++) mpz_mul (b[k], a[0], c[k]); for (k = l + 1; k <= n; k++) mpz_set_ui (b[k], 0); #ifdef DEBUG fprintf (ECM_STDOUT, "Exit TKarMul.\n"); #endif return MIN (n, l) + 1; } mu = (m / 2) + 1; /* 1 <= mu <= m */ nu = (n / 2) + 1; /* 1 <= nu <= n */ h = MAX (mu, nu); /* h >= 1 */ #ifdef DEBUG fprintf (ECM_STDOUT, "mu = %d\nnu = %d\nh = %d\n", mu, nu, h); #endif if (mu > n) { #ifdef DEBUG fprintf (ECM_STDOUT, "Case mu > n.\n"); #endif tot_muls += TKarMul (b, n, a, mu - 1, c, l, t); if (l >= mu) { /* we have to check l-mu <= n + (m-mu), i.e. l <= n+m */ tot_muls += TKarMul (t, n, a + mu, m - mu, c + mu, l - mu, t + n + 1); list_add (b, b, t, n + 1); } #ifdef DEBUG fprintf (ECM_STDOUT, "Exit TKarMul.\n"); #endif return tot_muls; } if (nu > m) { #ifdef DEBUG fprintf (ECM_STDOUT, "Case nu > m.\n"); #endif /* we have to check MIN(l,m+nu-1) <= nu-1+m: trivial */ tot_muls += TKarMul (b, nu - 1, a, m, c, MIN (l, m + nu - 1), t); /* Description broken in reference. Should be a list * concatenation, not an addition. * Fixed now. */ if (l >= nu) { /* we have to check l-nu <= n-nu+m, i.e. 
l <= n+m: trivial */ tot_muls += TKarMul (b + nu, n - nu, a, m, c + nu, l - nu, t); } else list_zero (b + nu, n - nu + 1); #ifdef DEBUG fprintf (ECM_STDOUT, "Exit TKarMul.\n"); #endif return tot_muls; } /* We want nu = mu */ mu = nu = h; #ifdef DEBUG fprintf (ECM_STDOUT, "Base Case.\n"); #endif s1 = MIN (l + 1, n + mu); if (l + 1 > nu) list_sub_wrapper (t, c, c + nu, s1, l - nu + 1); else list_set (t, c, s1); #ifdef DEBUG fprintf (ECM_STDOUT, "DEBUG c - c[nu].\n"); print_list (t, s1); fprintf (ECM_STDOUT, "We compute (1) - (3)\n"); #endif tot_muls += TKarMul (b, nu - 1, a, mu - 1, t, s1 - 1, t + s1); /* (1) - (3) */ #ifdef DEBUG print_list (b, nu); fprintf (ECM_STDOUT, "We compute (2) - (4)\n"); #endif if (s1 >= nu + 1) { /* nu - 1 */ tot_muls += TKarMul (b + nu, n - nu, a + mu, m - mu, t + nu, s1 - nu - 1, t + s1); /* (2) - (4) */ } else { list_zero (b + nu, n - nu + 1); } #ifdef DEBUG print_list (b + nu, n - nu + 1); #endif list_add_wrapper (t, a, a + mu, mu, m + 1 - mu); #ifdef DEBUG fprintf (ECM_STDOUT, "We compute (2) + (3)\n"); #endif if (l >= nu) { tot_muls += TKarMul (t + mu, nu - 1, t, mu - 1, c + nu, l - nu, t + mu + nu); } else list_zero (t + mu, nu); /* (2) + (3) */ #ifdef DEBUG print_list (t + mu, nu); #endif list_add (b, b, t + mu, nu); list_sub (b + nu, t + mu, b + nu, n - nu + 1); return tot_muls; } /* Computes the space needed for TKarMul of b[0..n], * a[0..m] and c[0..l] */ static unsigned int TKarMul_space (unsigned int n, unsigned int m, unsigned int l) { unsigned int mu, nu, h; unsigned int s1; unsigned int r1, r2; if (n == 0) return 0; if (m == 0) return 0; mu = (m / 2) + 1; nu = (n / 2) + 1; h = MAX (mu, nu); if (mu > n) { r1 = TKarMul_space (n, mu - 1, l); if (l >= mu) { r2 = TKarMul_space (n, m - mu, l - mu) + n + 1; r1 = MAX (r1, r2); } return r1; } if (nu > m) { r1 = TKarMul_space (nu - 1, m, MIN (l, m + nu - 1)); if (l >= nu) { r2 = TKarMul_space (n - nu, m,l - nu); r1 = MAX (r1, r2); } return r1; } mu = nu = h; s1 = MIN (l + 1, n + mu); 
r1 = TKarMul_space (nu - 1, mu - 1, s1 - 1) + s1; if (s1 >= nu + 1) { r2 = TKarMul_space (n - nu, m - mu, s1 - nu - 1) + s1; r1 = MAX (r1, r2); } if (l >= nu) { r2 = TKarMul_space (nu - 1, mu - 1, l - nu) + mu + nu; r1 = MAX (r1, r2); } return r1; } /* list_sub with bound checking */ static void list_sub_safe (listz_t ret, listz_t a, listz_t b, unsigned int sizea, unsigned int sizeb, unsigned int needed) { unsigned int i; unsigned int safe; safe = MIN(sizea, sizeb); safe = MIN(safe, needed); list_sub (ret, a, b, safe); i = safe; while (i < needed) { if (i < sizea) { if (i < sizeb) mpz_sub (ret[i], a[i], b[i]); else mpz_set (ret[i], a[i]); } else { if (i < sizeb) mpz_neg (ret[i], b[i]); else mpz_set_ui (ret[i], 0); } i++; } } /* list_add with bound checking */ static void list_add_safe (listz_t ret, listz_t a, listz_t b, unsigned int sizea, unsigned int sizeb, unsigned int needed) { unsigned int i; unsigned int safe; safe = MIN(sizea, sizeb); safe = MIN(safe, needed); list_add (ret, a, b, i = safe); while (i < needed) { if (i < sizea) { if (i < sizeb) mpz_add (ret[i], a[i], b[i]); else mpz_set (ret[i], a[i]); } else { if (i < sizeb) mpz_set (ret[i], b[i]); else mpz_set_ui (ret[i], 0); } i++; } } static unsigned int TToomCookMul (listz_t b, unsigned int n, listz_t a, unsigned int m, listz_t c, unsigned int l, listz_t tmp) { unsigned int nu, mu, h; unsigned int i; unsigned int btmp; unsigned int tot_muls = 0; nu = n / 3 + 1; mu = m / 3 + 1; /* ensures n + 1 > 2 * nu */ if ((n < 2 * nu) || (m < 2 * mu)) { #ifdef DEBUG fprintf (ECM_STDOUT, "Too small operands, calling TKara.\n"); #endif return TKarMul (b, n, a, m, c, l, tmp); } /* First strip unnecessary trailing coefficients of c: */ l = MIN(l, n + m); /* Now the degenerate cases. We want 2 * nu <= m. 
* */ if (m < 2 * nu) { #ifdef DEBUG fprintf (ECM_STDOUT, "Degenerate Case 1.\n"); #endif tot_muls += TToomCookMul (b, nu - 1, a, m, c, l, tmp); if (l >= nu) tot_muls += TToomCookMul (b + nu, nu - 1, a, m, c + nu, l - nu, tmp); else list_zero (b + nu, nu); if (l >= 2 * nu) /* n >= 2 * nu is assured. Hopefully */ tot_muls += TToomCookMul (b + 2 * nu, n - 2 * nu, a, m, c + 2 * nu, l - 2 * nu, tmp); else list_zero (b + 2 * nu, n - 2 * nu + 1); return tot_muls; } /* Second degenerate case. We want 2 * mu <= n. */ if (n < 2 * mu) { #ifdef DEBUG fprintf (ECM_STDOUT, "Degenerate Case 2.\n"); #endif tot_muls += TToomCookMul (b, n, a, mu - 1, c, l, tmp); if (l >= mu) { tot_muls += TToomCookMul (tmp, n, a + mu, mu - 1, c + mu, l - mu, tmp + n + 1); list_add (b, b, tmp, n + 1); } if (l >= 2 * mu) { tot_muls += TToomCookMul (tmp, n, a + 2 * mu, m - 2 * mu, c + 2 * mu, l - 2 * mu, tmp + n + 1); list_add (b, b, tmp, n + 1); } return tot_muls; } #ifdef DEBUG fprintf (ECM_STDOUT, "Base Case.\n"); fprintf (ECM_STDOUT, "a = "); print_list (a, m + 1); fprintf (ECM_STDOUT, "\nc = "); print_list (c, l + 1); #endif h = MAX(nu, mu); nu = mu = h; list_sub_safe (tmp, c + 3 * h, c + h, (l + 1 > 3 * h ? l + 1 - 3 * h : 0), (l + 1 > h ? l + 1 - h : 0), 2 * h - 1); list_sub_safe (tmp + 2 * h - 1, c, c + 2 * h, l + 1, (l + 1 > 2 * h ? l + 1 - 2 * h : 0), 2 * h - 1); for (i = 0; i < 2 * h - 1; i++) mpz_mul_2exp (tmp[2 * h - 1 + i], tmp[2 * h - 1 + i], 1); #ifdef DEBUG print_list (tmp, 4 * h - 2); #endif /* -------------------------------- * | 0 .. 2*h-2 | 2*h-1 .. 4*h-3 | * -------------------------------- * | c3 - c1 | 2(c0 - c2) | * -------------------------------- */ list_add (tmp + 2 * h - 1, tmp + 2 * h - 1, tmp, 2 * h - 1); tot_muls += TToomCookMul (b, h - 1, a, h - 1, tmp + 2 * h - 1, 2 * h - 2, tmp + 4 * h - 2); /* b[0 .. 
h - 1] = 2 * m0 */ #ifdef DEBUG fprintf (ECM_STDOUT, "2 * m0 = "); print_list (b, h); #endif list_add (tmp + 2 * h - 1, a, a + h, h); list_add (tmp + 2 * h - 1, tmp + 2 * h - 1, a + 2 * h, MIN(h, m + 1 - 2 * h)); /* tmp[2*h-1 .. 3*h-2] = a0 + a1 + a2 */ #ifdef DEBUG fprintf (ECM_STDOUT, "\na0 + a1 + a2 = "); print_list (tmp + 2 * h - 1, h); #endif list_sub_safe (tmp + 3 * h - 1, c + 2 * h, c + 3 * h, (l + 1 > 2 * h ? l + 1 - 2 * h : 0), (l + 1 > 3 * h ? l + 1 - 3 * h : 0), 2 * h - 1); /* ------------------------------------------------- * | 0 .. 2*h-2 | 2*h-1 .. 3*h-2 | 3*h-1 .. 5*h-3 | * ------------------------------------------------- * | c3 - c1 | a0 + a1 + a2 | c2 - c3 | * ------------------------------------------------- */ btmp = (l + 1 > h ? l + 1 - h : 0); btmp = MIN(btmp, 2 * h - 1); for (i = 0; i < btmp; i++) { mpz_mul_2exp (tmp[5 * h - 2 + i], c[h + i], 1); mpz_add (tmp[5 * h - 2 + i], tmp[5 * h - 2 + i], tmp[3 * h - 1 + i]); } while (i < 2 * h - 1) { mpz_set (tmp[5 * h - 2 + i], tmp[3 * h - 1 + i]); i++; } tot_muls += TToomCookMul (b + h, h - 1, tmp + 2 * h - 1, h - 1, tmp + 5 * h - 2, 2 * h - 2, tmp + 7 * h - 3); /* b[h .. 2 * h - 1] = 2 * m1 */ #ifdef DEBUG fprintf (ECM_STDOUT, "\n2 * m1 = "); print_list (b + h, h); #endif /* ------------------------------------------------------------------ * | 0 .. 2*h-2 | 2*h-1 .. 3*h-2 | 3*h-1 .. 5*h-3 | 5*h-2 .. 7*h-4 | * ------------------------------------------------------------------ * | c3 - c1 | a0 + a1 + a2 | c2 - c3 | c2 - c3 + 2c1 | * ------------------------------------------------------------------ */ for (i = 0; i < h; i++) { mpz_add (tmp[2 * h - 1 + i], tmp[2 * h - 1 + i], a[i + h]); if (2 * h + i <= m) mpz_addmul_ui (tmp[2 * h - 1 + i], a[2 * h + i], 3); } tot_muls += TToomCookMul (tmp + 5 * h - 2, h - 1, tmp + 2 * h - 1, h - 1, tmp, 2 * h - 2, tmp + 6 * h - 2); /* tmp[5*h-2 .. 
6*h - 3] = 6 * m2 */ #ifdef DEBUG fprintf (ECM_STDOUT, "\n6 * m2 = "); print_list (tmp + 5 * h - 2, h); #endif for (i = 0; i < h; i++) { mpz_sub (tmp[2 * h - 1 + i], a[i], a[h + i]); if (i + 2 * h <= m) mpz_add (tmp[2 * h - 1 + i], tmp[2 * h - 1 + i], a[2 * h + i]); } for (i = 0; i < 2 * h - 1; i++) { mpz_mul_ui (tmp[3 * h - 1 + i], tmp[3 * h - 1 + i], 3); mpz_mul_2exp (tmp[i], tmp[i], 1); } list_add (tmp + 3 * h - 1, tmp + 3 * h - 1, tmp, 2 * h - 1); tot_muls += TToomCookMul (tmp + 6 * h - 2, h - 1, tmp + 2 * h - 1, h - 1, tmp + 3 * h - 1, 2 * h - 2, tmp + 7 * h - 2); /* tmp[6h-2 .. 7h - 3] = 6 * mm1 */ #ifdef DEBUG fprintf (ECM_STDOUT, "\n6 * mm1 = "); print_list (tmp + 6 * h - 2, h); #endif list_add_safe (tmp, tmp, c + 2 * h, 2 * h, (l + 1 > 2 * h ? l + 1 - 2 * h : 0), 2 * h - 1); list_sub_safe (tmp, c + 4 * h, tmp, (l + 1 > 4 * h ? l + 1 - 4 * h : 0), 2 * h - 1, 2 * h - 1); tot_muls += TToomCookMul (b + 2 * h, n - 2 * h, a + 2 * h, m - 2 * h, tmp, 2 * h - 1, tmp + 7 * h - 2); /* b[2 * h .. n] = minf */ #ifdef DEBUG fprintf (ECM_STDOUT, "\nminf = "); print_list (b + 2 * h, n + 1 - 2 * h); #endif /* Layout of b : * --------------------------------------- * | 0 ... h-1 | h ... 2*h-1 | 2*h ... n | * --------------------------------------- * | 2 * m0 | 2 * m1 | minf | * --------------------------------------- * * Layout of tmp : * --------------------------------------------------- * | 0 ... 5*h-1 | 5*h-2 ... 6*h-3 | 6*h-2 ... 7*h-3 | * --------------------------------------------------- * | ?????? | 6 * m2 | 6 * mm1 | * --------------------------------------------------- */ list_add (tmp, tmp + 5 * h - 2, tmp + 6 * h - 2, h); for (i = 0; i < h; i++) mpz_divby3_1op (tmp[i]); /* t1 = 2 (m2 + mm1) * tmp[0 .. h - 1] = t1 */ list_add (b, b, b + h, h); list_add (b, b, tmp, h); for (i = 0; i < h; i++) mpz_tdiv_q_2exp (b[i], b[i], 1); /* b_{low} should be correct */ list_add (tmp + h, b + h, tmp, h); /* t2 = t1 + 2 m1 * tmp[h .. 
2h - 1] = t2 */ list_add (b + h, tmp, tmp + h, h); list_sub (b + h, b + h, tmp + 6 * h - 2, h); for (i = 0; i < h; i++) mpz_tdiv_q_2exp (b[h + i], b[h + i], 1); /* b_{mid} should be correct */ list_add (tmp + h, tmp + h, tmp + 5 * h - 2, n + 1 - 2 * h); for (i = 0; i < n + 1 - 2 * h; i++) mpz_tdiv_q_2exp (tmp[h + i], tmp[h + i], 1); list_add (b + 2 * h, b + 2 * h, tmp + h, n + 1 - 2 * h); /* b_{high} should be correct */ return tot_muls; } /* Returns space needed by TToomCookMul */ unsigned int TToomCookMul_space (unsigned int n, unsigned int m, unsigned int l) { unsigned int nu, mu, h; unsigned int stmp1, stmp2; nu = n / 3 + 1; mu = m / 3 + 1; stmp1 = stmp2 = 0; /* ensures n + 1 > 2 * nu */ if ((n < 2 * nu) || (m < 2 * mu)) return TKarMul_space (n, m, l); /* First strip unnecessary trailing coefficients of c: */ l = MIN(l, n + m); /* Now the degenerate cases. We want 2 * nu < m. * */ if (m <= 2 * nu) { stmp1 = TToomCookMul_space (nu - 1, m, l); if (l >= 2 * nu) stmp2 = TToomCookMul_space (n - 2 * nu, m, l - 2 * nu); else if (l >= nu) stmp2 = TToomCookMul_space (nu - 1, m, l - nu); return MAX(stmp1, stmp2); } /* Second degenerate case. We want 2 * mu < n. */ if (n <= 2 * mu) { stmp1 += TToomCookMul_space (n, mu - 1, l); if (l >= 2 * mu) stmp2 = TToomCookMul_space (n, m - 2 * mu, l - 2 * mu) + n + 1; else if (l >= mu) stmp2 = TToomCookMul_space (n, mu - 1, l - mu) + n + 1; return MAX(stmp1, stmp2); } h = MAX(nu, mu); stmp1 = TToomCookMul_space (h - 1, h - 1, 2 * h - 2); stmp2 = stmp1 + 7 * h - 2; stmp1 = stmp1 + 6 * h - 2; stmp1 = MAX(stmp1, stmp2); stmp2 = TToomCookMul_space (n - 2 * h, m - 2 * h, 2 * h - 1) + 7*h-2; return MAX(stmp1, stmp2); } /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients of degree m to n+m of rev(a)*c, i.e. b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l) ... b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k) ... b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n]. Using auxiliary memory in tmp. 
Assumes n <= l. Returns number of multiplications if known, 0 if not known, and -1 for error. */ int TMulGen (listz_t b, unsigned int n, listz_t a, unsigned int m, listz_t c, unsigned int l, listz_t tmp, mpz_t modulus) { ASSERT (n <= l); if (Fermat) { unsigned int i; for (i = l + 1; i > 1 && (i&1) == 0; i >>= 1); ASSERT(i == 1); ASSERT(n + 1 == (l + 1) / 2); ASSERT(m == l - n || m + 1 == l - n); return F_mul_trans (b, a, c, m + 1, l + 1, Fermat, tmp); } #ifdef KS_MULTIPLY if ((double) n * (double) mpz_sizeinbase (modulus, 2) >= KS_TMUL_THRESHOLD) { if (TMulKS (b, n, a, m, c, l, modulus, 1)) /* Non-zero means error */ return -1; return 0; /* We have no mul count so we return 0 */ } #endif return TToomCookMul (b, n, a, m, c, l, tmp); } unsigned int TMulGen_space (unsigned int n, unsigned int m, unsigned int l) { if (Fermat) return 2 * (l + 1); else return TToomCookMul_space (n, m, l); } ecm-6.4.4/resume.c0000644023561000001540000003460212106741273010667 00000000000000/* Functions for reading a writing resume file lines. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012 Paul Zimmermann, Alexander Kruppa and Cyril Bouvier. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include #include #include #if !defined (_MSC_VER) #include #endif #include #include "ecm.h" #include "ecm-ecm.h" #ifdef HAVE_FCNTL_H #include #endif #if defined (_MSC_VER) || defined (__MINGW32__) /* needed to declare GetComputerName() for write_resumefile_line() */ #include #endif /* Reads a string of characters from fd while they match the string s. Returns the number of matching characters that were read. */ static int facceptstr (FILE *fd, char *s) { int c; unsigned i = 0; while (s[i] != 0 && (c = fgetc (fd)) != EOF) { if (c != s[i++]) { ungetc (c, fd); return i-1; } } return i; } /* Accepts "\n" or "\r\n" or "\r". Returns 1 if any of the three was read, 0 otherwise */ static int facceptnl (FILE *fd) { int c, r = 0; c = fgetc (fd); if (c == '\r') { c = fgetc (fd); r = 1; } if (c == '\n') r = 1; else if (c != EOF) ungetc (c, fd); return r; } /* Reads a string from fd until the character "delim" or newline is seen, or "len" characters have been written to "s" (including terminating null), or EOF is reached. The "delim" and newline characters are left on the stream. If s is NULL, characters are read from fd but not written anywhere. Returns the number of characters read. */ static int freadstrn (FILE *fd, char *s, char delim, unsigned int len) { unsigned int i = 0; int c; while (i + 1 < len && (c = fgetc (fd)) != EOF) if (c == delim || IS_NEWLINE(c)) { ungetc (c, fd); break; } else if (s != NULL) s[i++] = (char) c; if (i < len && s != NULL) s[i++] = 0; return i; } /* Reads an assignment from a save file. 
Return 1 if an assignment was successfully read, 0 if there are no more lines to read (at EOF) */ int read_resumefile_line (int *method, mpz_t x, mpcandi_t *n, mpz_t sigma, mpz_t A, mpz_t x0, double *b1, char *program, char *who, char *rtime, char *comment, FILE *fd) { int a, have_method, have_x, have_z, have_n, have_sigma, have_a, have_b1, have_checksum, have_qx; unsigned int saved_checksum; char tag[16]; mpz_t z; while (!feof (fd)) { /* Ignore empty lines */ if (facceptnl (fd)) { continue; } /* Ignore lines beginning with '#'*/ if (facceptstr (fd, "#")) { while (!facceptnl (fd) && !feof (fd)) fgetc (fd); continue; } if (feof (fd)) break; have_method = have_x = have_z = have_n = have_sigma = have_a = have_b1 = have_qx = have_checksum = 0; /* Set optional fields to zero */ mpz_set_ui (sigma, 0); mpz_set_ui (A, 0); if (program != NULL) program[0] = 0; if (who != NULL) who[0] = 0; if (rtime != NULL) rtime[0] = 0; if (comment != NULL) comment[0] = 0; while (!facceptnl (fd) && !feof (fd)) { freadstrn (fd, tag, '=', 16); if (!facceptstr (fd, "=")) { printf ("Save file line has no equal sign after: %s\n", tag); goto error; } if (strcmp (tag, "METHOD") == 0) { if (facceptstr (fd, "ECM") == 3) { *method = ECM_ECM; } else if (facceptstr (fd, "P")) { a = facceptstr (fd, "-1"); if (a == 2) { *method = ECM_PM1; } else if (a == 0 && facceptstr (fd, "+1") == 2) { *method = ECM_PP1; } else goto error; } else goto error; have_method = 1; } else if (strcmp (tag, "X") == 0) { mpz_inp_str (x, fd, 0); have_x = 1; } else if (strcmp (tag, "Z") == 0) { mpz_init (z); mpz_inp_str (z, fd, 0); have_z = 1; } else if (strcmp (tag, "QX") == 0) { mpz_inp_str (x, fd, 0); have_qx = 1; } else if (strcmp (tag, "X0") == 0) { mpz_inp_str (x0, fd, 0); } else if (strcmp (tag, "CHECKSUM") == 0) { if (fscanf (fd, "%u", &saved_checksum) != 1) goto error; have_checksum = 1; } else if (strcmp (tag, "COMMENT") == 0) { freadstrn (fd, comment, ';', 255); } else if (strcmp (tag, "N") == 0) { /*mpz_inp_str (n, 
fd, 0);*/ /* we want to "maintain" any expressions, which were possibly stored in the file for N */ have_n = read_number (n, fd, 0); } else if (strcmp (tag, "SIGMA") == 0) { mpz_inp_str (sigma, fd, 0); have_sigma = 1; } else if (strcmp (tag, "A") == 0) { mpz_inp_str (A, fd, 0); have_a = 1; } else if (strcmp (tag, "B1") == 0) { if (fscanf (fd, "%lf", b1) != 1) goto error; have_b1 = 1; } else if (strcmp (tag, "PROGRAM") == 0) { freadstrn (fd, program, ';', 255); } else if (strcmp (tag, "WHO") == 0) { freadstrn (fd, who, ';', 255); } else if (strcmp (tag, "TIME") == 0) { freadstrn (fd, rtime, ';', 255); } else /* Not a tag we know about */ { printf ("Save file line has unknown tag: %s\n", tag); goto error; } /* Prime95 lines have no semicolon after SIGMA */ if (!facceptstr (fd, ";") && ! (have_qx && have_n && have_sigma)) { printf ("%s field not followed by semicolon\n", tag); goto error; } while (facceptstr (fd, " ")); } /* Finished reading tags */ /* Handle Prime95 v22 lines. These have no METHOD=ECM field and QX= instead of X= */ if (have_qx) { if (have_n && have_sigma) { *method = ECM_ECM; /* *b1 = 1.0; */ strcpy (program, "Prime95"); mpz_mod (x, x, n->n); return 1; } goto error; } #ifdef DEBUG if (*method != ECM_ECM && (have_sigma || have_a || have_z)) { int count = have_sigma + have_a + have_z; printf ("Warning: Save file line has"); if (have_sigma) { printf (" SIGMA"); mpz_set_ui (sigma, 0); if (--count > 1) printf (","); else if (count > 0) printf (" and"); } if (have_a) { printf (" A"); mpz_set_ui (A, 0); if (--count > 0) printf (" and"); } if (have_z) { printf (" Z"); mpz_clear (Z); have_z = 0; } printf (" value for method other than ECM.\n"); } #endif if (!have_method || !have_x || !have_n || !have_b1 || (method == ECM_ECM && !have_sigma && !have_a)) { fprintf (stderr, "Save file line lacks fields\n"); continue; } if (have_checksum) { mpz_t checksum; mpz_init (checksum); mpz_set_d (checksum, *b1); if (have_sigma) mpz_mul_ui (checksum, checksum, mpz_fdiv_ui 
(sigma, CHKSUMMOD)); if (have_a) mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (A, CHKSUMMOD)); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (n->n, CHKSUMMOD)); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (x, CHKSUMMOD)); if (have_z) mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (z, CHKSUMMOD)); if (mpz_fdiv_ui (checksum, CHKSUMMOD) != saved_checksum) { fprintf (stderr, "Resume file line has bad checksum %u, expected %lu\n", saved_checksum, mpz_fdiv_ui (checksum, CHKSUMMOD)); mpz_clear (checksum); continue; } mpz_clear (checksum); } mpz_mod (x, x, n->n); if (have_z) /* Must normalize */ { if (!mpz_invert (z, z, n->n)) /* Factor found? */ { /* Oh great. What do we do with it now? */ /* mpres_gcd (f, z, n); */ printf ("Oops, factor found while reading from save file.\n"); } mpz_mul (z, z, x); mpz_mod (x, z, n->n); } return 1; error: /* In case of error, read rest of line and try next line */ while (!facceptnl (fd) && !feof (fd)) fgetc (fd); } /* We hit EOF without reading a proper save line */ return 0; } /* Append a residue to the savefile with name given in fn. 
Returns 1 on success, 0 on error */ int write_resumefile_line (char *fn, int method, double B1, mpz_t sigma, mpz_t A, mpz_t x, mpcandi_t *n, mpz_t x0, const char *comment) { FILE *file; mpz_t checksum; time_t t; char text[256]; char *uname, mname[32]; #if defined(HAVE_FCNTL) && defined(HAVE_FILENO) struct flock lock; int r, fd; #endif #ifdef DEBUG if (fn == NULL) { fprintf (stderr, "write_resumefile_line: fn == NULL\n"); exit (EXIT_FAILURE); } #endif file = fopen (fn, "a"); if (file == NULL) { fprintf (stderr, "Could not open file %s for writing\n", fn); return 0; } #if defined(HAVE_FCNTL) && defined(HAVE_FILENO) /* Try to get a lock on the file so several processes can append to the same file safely */ /* Supposedly some implementations of fcntl() can get confused over garbage in unused fields in a flock struct, so zero it */ memset (&lock, 0, sizeof (struct flock)); fd = fileno (file); lock.l_type = F_WRLCK; lock.l_whence = SEEK_SET; lock.l_start = 0; lock.l_len = 1; /* F_SETLKW: blocking exclusive lock request */ r = fcntl (fd, F_SETLKW, &lock); if (r != 0) { fclose (file); return 0; } fseek (file, 0, SEEK_END); #endif mpz_init (checksum); mpz_set_d (checksum, B1); fprintf (file, "METHOD="); if (method == ECM_PM1) fprintf (file, "P-1"); else if (method == ECM_PP1) fprintf (file, "P+1"); else { fprintf (file, "ECM"); if (mpz_sgn (sigma) != 0) { fprintf (file, "; SIGMA="); mpz_out_str (file, 10, sigma); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (sigma, CHKSUMMOD)); } else if (mpz_sgn (A) != 0) { fprintf (file, "; A="); mpz_out_str (file, 10, A); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (A, CHKSUMMOD)); } } fprintf (file, "; B1=%.0f; N=", B1); if (n->cpExpr) fprintf(file, "%s", n->cpExpr); else mpz_out_str (file, 10, n->n); fprintf (file, "; X=0x"); mpz_out_str (file, 16, x); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (n->n, CHKSUMMOD)); mpz_mul_ui (checksum, checksum, mpz_fdiv_ui (x, CHKSUMMOD)); #ifdef GPUECM fprintf (file, "; CHECKSUM=%lu; PROGRAM=GPU-ECM 
%s;", mpz_fdiv_ui (checksum, CHKSUMMOD), VERSION_GPU); #else fprintf (file, "; CHECKSUM=%lu; PROGRAM=GMP-ECM %s;", mpz_fdiv_ui (checksum, CHKSUMMOD), VERSION); #endif mpz_clear (checksum); if (mpz_sgn (x0) != 0) { fprintf (file, " X0=0x"); mpz_out_str (file, 16, x0); fprintf (file, ";"); } /* Try to get the users and his machines name */ /* TODO: how to make portable? */ uname = getenv ("LOGNAME"); if (uname == NULL) uname = getenv ("USERNAME"); if (uname == NULL) uname = ""; #if defined (_MSC_VER) || defined (__MINGW32__) /* dummy block, so that the vars needed here don't need to "spill" over to the rest of the function. */ { DWORD size, i; TCHAR T[MAX_COMPUTERNAME_LENGTH+2]; size=MAX_COMPUTERNAME_LENGTH+1; if (!GetComputerName(T, &size)) strcpy(mname, "localPC"); else { for (i = 0; i < sizeof(mname)-1; ++i) mname[i] = T[i]; mname[sizeof(mname)-1] = 0; } } #else if (gethostname (mname, 32) != 0) mname[0] = 0; mname[31] = 0; /* gethostname() may omit trailing 0 if hostname >31 chars */ #endif if (uname[0] != 0 || mname[0] != 0) { fprintf (file, " WHO=%.233s@%.32s;", uname, mname); } if (comment[0] != 0) fprintf (file, " COMMENT=%.255s;", comment); t = time (NULL); strncpy (text, ctime (&t), 255); text[255] = 0; text[strlen (text) - 1] = 0; /* Remove newline */ fprintf (file, " TIME=%s;", text); fprintf (file, "\n"); fflush (file); #if defined(HAVE_FCNTL) && defined(HAVE_FILENO) lock.l_type = F_UNLCK; lock.l_whence = SEEK_SET; lock.l_start = 0; lock.l_len = 1; fcntl (fd, F_SETLKW, &lock); /* F_SETLKW: blocking lock request */ #endif fclose (file); return 1; } ecm-6.4.4/bestd.c0000644023561000001540000002517412106741273010474 00000000000000/* Choice of best parameters for stage 2. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2010 Paul Zimmermann, Alexander Kruppa, Dave Newman. This file is part of the ECM Library. 
The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "ecm-impl.h" /* Compute (d, d2, k) such that: (0) k >= k0 (1) d is a multiple of 6 (2) k * d * (eulerphi(d)/2) * d2 / eulerphi(d2) >= B2 - B2min (3) gcd(d, d2) == 1 (4) k is minimal, subject to previous conditions (5) if parameter po2 is != 0, rounds dF up to a power of 2 Return non-zero iff an error occurred (too large step 2 bound). */ /* How we test whether given d,d2,dF,k,i0 parameters cover the desired B2min-B2 range: In stage 2 we generate all values p = f(i * d) +- f(j * d2) with 1. gcd (i, d2) == 1, 2. gcd (j, d) == 1, 3. j == 1 (mod 6), 4. 6|d 5. 1 <= j <= d - 5, (it's -5, not just -1, because of 3. and 4.) 6. i0 <= i <= i1 7. gcd (d, d2) == 1 where f(x) is x^S or the S-th Dickson polynomial g_{S,-1}(x). Extra factors included by S>1 are not considered in this analysis, we assume S=1, f(x)=x so that p = i * d +- j * d2. (Note: i values greater than stated in 3. may be generated if we have to round up dF, for example to a power of 2. However, the root generation code can put anything it likes in those extra roots, so we make no assumption here that this will extend the range of the i values.) 
Hence the values at the high end of the stage 2 range that are not generated are p = (i1 + n) * d +- j * d2, n > 0 and the smallest one of those is p = (i1 + 1) * d - (d - 5) * d2 = d * (i1 - d2 + 1) + 5 * d2 At the low end of stage 2, values not generated are p = (i0 - n) * d +- j * d2, n > 0 the largest one being p = (i0 - 1) * d + (d - 5) * d2 = d * (i0 + d2 - 1) - 5*d2 Thus, values p that are coprime do d*d2 and d * (i0 + d2 - 1) - 5*d2 + 1 <= p <= d * (i1 - d2 + 1) + 5 * d2 - 1 are included in stage 2. The number of roots of G we compute is k * dF. For d2 == 1, this means i1 = i0 + k * dF - 1 (-1 because both i0 and i1 are included). For d2 > 1, values j not coprime to d2 are skipped (see condition 1). The number of values in [1, i0] that are not coprime to d2 (with d2 prime) is floor (i0 / d2); in [1, i1] it is floor (i1 / d2). So we require that k * dF >= i1 - i0 + 1 - (floor (i1 / d2) - floor (i0 / d2)) */ int bestD (root_params_t *root_params, unsigned long *finalk, unsigned long *finaldF, mpz_t B2min, mpz_t B2, int po2, int use_ntt, double maxmem, int treefile, mpmod_t modulus) { /* the following list contains successive values of b with increasing values of eulerphi(b). 
It was generated by the following Maple program: l := [[1,1]]: for b from 12 by 6 do d:=numtheory[phi](b)/2; while d <= l[nops(l)][2] do l:=subsop(nops(l)=NULL, l) od; n := nops(l); if b>1.1*l[n][1] then l := [op(l), [b,d]]; lprint(l) fi; od: */ #define N 109 static unsigned int l[N] = {12, 18, 30, 42, 60, 90, 120, 150, 210, 240, 270, 330, 420, 510, 630, 840, 1050, 1260, 1470, 1680, 1890, 2310, 2730, 3150, 3570, 3990, 4620, 5460, 6090, 6930, 8190, 9240, 10920, 12180, 13860, 16170, 18480, 20790, 23100, 30030, 34650, 39270, 43890, 48510, 60060, 66990, 78540, 90090, 99330, 120120, 133980, 150150, 180180, 210210, 240240, 270270, 300300, 334950, 371280, 420420, 510510, 570570, 600600, 630630, 746130, 870870, 1021020, 1141140, 1291290, 1531530, 1711710, 1891890, 2081310, 2312310, 2552550, 2852850, 3183180, 3573570, 3993990, 4594590, 5105100, 5705700, 6322470, 7147140, 7987980, 8978970, 10210200, 11741730, 13123110, 14804790, 16546530, 19399380, 21411390, 23993970, 26816790, 29609580, 33093060, 36606570, 40330290, 44414370, 49639590, 54624570, 60090030, 67897830, 77597520, 87297210, 96996900, 107056950, 118107990}; #define Npo2 23 static unsigned int lpo2[Npo2] = {12, 30, 60, 120, 240, 510, 1020, 2310, 4620, 9240, 19110, 39270, 79170, 158340, 324870, 690690, 1345890, 2852850, 5705700, 11741730, 23130030, 48498450, 96996900}; unsigned long i, d1 = 0, d2 = 0, dF = 0, phid, k, maxN; mpz_t j, t, i0, i1; int r = 0; if (mpz_cmp (B2, B2min) < 0) { /* No stage 2. Set relevant parameters to 0. Leave B2, B2min the same */ *finalk = 0; *finaldF = 0; return 0; } MPZ_INIT (i0); MPZ_INIT (i1); MPZ_INIT (j); MPZ_INIT (t); k = *finalk; /* User specified k value passed in via finalk */ /* Look for largest dF we can use while satisfying the maxmem parameter */ maxN = (po2) ? Npo2 : N; if (maxmem != 0.) { for (i = 0; i < maxN; i++) { int lg_dF, sp_num = 0; double memory; d1 = (po2) ? lpo2[i] : l[i]; phid = eulerphi (d1) / 2; dF = (po2) ? 
1U << ceil_log2 (phid) : phid; lg_dF = ceil_log2 (dF); if (use_ntt) sp_num = (2 * mpz_sizeinbase (modulus->orig_modulus, 2) + lg_dF) / SP_NUMB_BITS + 4; memory = memory_use (dF, sp_num, (treefile) ? 0 : lg_dF, modulus); outputf (OUTPUT_DEVVERBOSE, "Estimated mem for dF = %.0d, sp_num = %d: %.0f\n", dF, sp_num, memory); if (memory > maxmem) break; } maxN = i; } for (i = 0; i < maxN; i++) { d1 = (po2) ? lpo2[i] : l[i]; phid = eulerphi (d1) / 2; dF = (po2) ? 1U << ceil_log2 (phid) : phid; /* Look for smallest prime < 25 that does not divide d1 */ /* The caller can force d2 = 1 by setting root_params->d2 != 0 */ d2 = 1; if (root_params->d2 == 0) for (d2 = 5; d2 < 25; d2 += 2) { if (d2 % 3 == 0) continue; if (d1 % d2 > 0) break; } if (d2 >= 25 || d2 - 1 > dF) d2 = 1; #if 0 /* The code to init roots of G can handle negative i0 now. */ if (d2 > 1 && mpz_cmp_ui (B2min, (d1 - 1) * d2 - d1) <= 0) d2 = 1; /* Would make i0 < 0 */ #endif mpz_set_ui (i0, d1 - 1); mpz_mul_ui (i0, i0, d2); mpz_set (j, B2); mpz_add (i1, j, i0); /* i1 = B2 + (d1 - 1) * d2 */ mpz_set (j, B2min); mpz_sub (i0, j, i0); /* i0 = B2min - (d1 - 1) * d2 */ mpz_cdiv_q_ui (i0, i0, d1); /* i0 = ceil ((B2min - (d1 - 1) * d2) / d1) */ mpz_fdiv_q_ui (i1, i1, d1); /* i1 = floor ((B2 + (d1 - 1) * d2) / d1) */ /* How many roots of G will we need ? */ mpz_sub (j, i1, i0); mpz_add_ui (j, j, 1); /* Integer multiples of d2 are skipped (if d2 > 1) */ if (d2 > 1) { mpz_fdiv_q_ui (t, i1, d2); mpz_sub (j, j, t); mpz_fdiv_q_ui (t, i0, d2); mpz_add (j, j, t); /* j -= floor (i1 / d2) - floor (i0 / d2) */ } /* How many blocks will we need ? Divide lines by dF, rounding up */ mpz_cdiv_q_ui (j, j, dF); if ((k != ECM_DEFAULT_K && mpz_cmp_ui (j, k) <= 0) || (k == ECM_DEFAULT_K && mpz_cmp_ui (j, (po2) ? 6 : 2) <= 0)) break; } if (i == maxN) { if (k != ECM_DEFAULT_K) { /* The user asked for a specific k and we couldn't satisfy the condition. Nothing we can do ... 
*/ outputf (OUTPUT_ERROR, "Error: too large step 2 bound, increase -k\n"); r = ECM_ERROR; goto clear_and_exit; } else if (!mpz_fits_ulong_p (j)) { /* Can't fit the number of blocks in an unsigned long. Nothing we can do ... */ outputf (OUTPUT_ERROR, "Error: stage 2 interval too large, cannot " "generate suitable parameters.\nTry a smaller B2 value.\n"); r = ECM_ERROR; goto clear_and_exit; } if (maxN == 0) { /* We can't do a stage 2 at all with the memory the user allowed. Nothing we can do ... */ outputf (OUTPUT_ERROR, "Error: stage 2 not possible with memory " "allowed by -maxmem.\n"); r = ECM_ERROR; goto clear_and_exit; } /* else: We can fit the number of blocks into an unsigned int, so we use it. This may be a very large value for huge B2-B2min, the user is going to notice sooner or later */ } /* If the user specified a number of blocks, we'll use that no matter what. Since j may be smaller than k, this may increase the B2 limit */ if (k == ECM_DEFAULT_K) k = mpz_get_ui (j); /* Now that we have the number of blocks, compute real i1. There will be k * dF roots of G computed, starting at i0, skipping all that are not coprime to d2. While d2 is prime, that means: are not multiples of d2. 
Hence we want i1 so that i1 - floor(i1 / d2) - i0 + ceil((i0 / d2) == k * dF i1 - floor(i1 / d2) == k * dF + i0 - ceil((i0 / d2) */ mpz_set_ui (j, k); mpz_mul_ui (j, j, dF); if (d2 == 1) { mpz_add (i1, i0, j); mpz_sub_ui (i1, i1, 1); } else { mpz_add (j, j, i0); mpz_cdiv_q_ui (t, i0, d2); mpz_sub (j, j, t); /* j = k * dF + i0 - ceil((i0 / d2) */ mpz_fdiv_qr_ui (j, t, j, d2 - 1); mpz_mul_ui (j, j, d2); mpz_add (i1, j, t); } root_params->d1 = d1; root_params->d2 = d2; mpz_set (root_params->i0, i0); *finaldF = dF; *finalk = k; /* We want B2' the largest integer that satisfies i1 = floor ((B2' + (d1 - 1) * d2) / d1) = floor ((B2'-d2)/d1) + d2 i1 - d2 = floor ((B2'-d2)/d1) (B2'-d2)/d1 < i1-d2+1 B2'-d2 < (i1-d2+1) * d1 B2' < (i1-d2+1) * d1 + d2 B2' = (i1-d2+1) * d1 + d2 - 1 */ mpz_sub_ui (i1, i1, d2 - 1); mpz_mul_ui (B2, i1, d1); mpz_add_ui (B2, B2, d2 - 1); clear_and_exit: mpz_clear (t); mpz_clear (j); mpz_clear (i1); mpz_clear (i0); return r; } ecm-6.4.4/Fgw.c0000644023561000001540000003355612106741273010121 00000000000000/* Interface code for George Woltman's gwnum library Copyright 2004, 2005, 2006, 2008, 2011, 2012 Paul Zimmermann, Alexander Kruppa, David Cleaver. Contains code based on the GWNUM library, copyright 2002-2005 George Woltman, Just For Fun Software, Inc. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. 
If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include /* for rint */ #include #include "ecm-gmp.h" #include "ecm.h" #include "ecm-impl.h" #define ADD_UNDERSCORES #include "gwdbldbl.h" #include "gwnum.h" #include "cpuid.h" void __gxx_personality_v0() { exit (EXIT_FAILURE); } void __cxa_guard_acquire () { return; } void __cxa_guard_release () { return; } static int sgn (const int i) { if (i == 0) return 0; return i > 0 ? 1 : -1; } /* With the following 2 functions, we try to find a representation of an input number in the form of z = k*b^n+c. If such a representation was found, set the the appropriate values and return 1. Otherwise, set b to zero and return 0. */ /* This function searches for a representation of z of the form k*b^n+c */ int kbnc_z (double *k, unsigned long *b, unsigned long *n, signed long *c, mpz_t z) { int i = 0; int j = 0; int exp = 1; int check_it_out = 0; int ret = 0; mpz_t diff; mpz_t abs_diff; mpz_t b_n; /* this will = base^exp */ mpz_t k_b_n; /* this will = k*b^n */ mpz_t test_k; mpz_t max_k; mpz_t lhs; /* used for finding the k value */ mpz_t rhs; /* used for finding the k value */ mpz_t base; mpz_t base_min; mpz_t base_max; unsigned long test_k_ui = 0; /* this puts a bound on how large our C value can be */ int max_diff = 8388607; /* make sure we have a place to put our results */ if (k == NULL || b == NULL || n == NULL || c == NULL) return 0; /* make sure the input meets some sort of minimum size requirement. 
The gwnum library reports ES1_CANNOT_DO_QUICKLY for number < 2^350 */ if (mpz_sizeinbase(z, 2) < 350) { *b = 0; return 0; } mpz_init (diff); mpz_init (abs_diff); mpz_init (b_n); mpz_init (k_b_n); mpz_init (lhs); mpz_init (rhs); mpz_init (test_k); mpz_init (base); mpz_init_set_ui (base_min, 2); mpz_init_set_ui (base_max, 10000); /* this puts a bound on how large of a k value we want to find */ mpz_init_set_str (max_k, "562949953421312", 10); /* when dividing: z/(base^exp) this will give us a possible k value */ /* we want a quick test to see if this might be a viable k value */ /* so, we want this k value to be close to an integer */ /* ie, test_k = 13.99999, is pretty close to the integer 14 */ /* since it is "pretty close", we can test this k value. */ /* whereas test_k = 13.5689, is not "pretty close" to an integer */ /* so, we will not run extra tests with this k value */ /* should we change this based on the size of z? */ /* for now, the code checks to see whether test_k is with 1/1000 of an integer */ for (mpz_set (base, base_min); mpz_cmp (base, base_max) <= 0; mpz_add_ui (base, base, 1)) { exp = (mpz_sizeinbase (z, 2) - 1) / (mpz_sizeinbase (base, 2) - 1) + 1; mpz_pow_ui (b_n, base, exp); /* base^exp should be > z here */ while (1) { check_it_out = 0; /* 0 */ mpz_tdiv_q (test_k, z, b_n); if (mpz_cmp(test_k, max_k) > 0) break; /* check to see if test_k is "pretty close" to the next smallest integer: z/b_n - test_k <= 1/1000 # z/b_n should be > test_k here z/b_n <= 1/1000 + test_k 1000*z/b_n <= 1 + 1000*test_k if (1000*z <= b_n + 1000*b_n*test_k) */ mpz_mul_ui (lhs, z, 1000); mpz_mul (rhs, b_n, test_k); mpz_mul_ui (rhs, rhs, 1000); mpz_add (rhs, rhs, b_n); if (mpz_cmp (lhs, rhs) <= 0) check_it_out = 1; /* check to see if test_k is "pretty close" to the next largest integer */ if (!check_it_out) { mpz_add_ui (test_k, test_k, 1); /* test_k - z/b_n <= 1/1000 # test_k should be > z/b_n here */ /* test_k <= 1/1000 + z/b_n */ /* test_k - 1/1000 <= z/b_n */ /* 
1000*test_k - 1 <= 1000*z/b_n */ /* if (1000*b_n*test_k - b_n <= 1000*z) */ mpz_mul (lhs, b_n, test_k); mpz_mul_ui (lhs, lhs, 1000); mpz_sub (lhs, lhs, b_n); mpz_mul_ui (rhs, z, 1000); if (mpz_cmp (lhs, rhs) <= 0) check_it_out = 1; } if (check_it_out) { mpz_mul (k_b_n, b_n, test_k); mpz_sub (diff, z, k_b_n); mpz_abs (abs_diff, diff); if (mpz_cmp_ui (abs_diff, max_diff) <= 0) { /* make sure k and c are relatively prime */ if (mpz_gcd_ui (NULL, test_k, mpz_get_ui (diff)) == 1) { /* we are done!!! */ *k = mpz_get_d (test_k); *b = mpz_get_ui (base); *n = exp; *c = mpz_get_si (diff); ret = 1; goto end_kbnc; } else { *b = 0; ret = 0; goto end_kbnc; } } } mpz_divexact (b_n, b_n, base); exp--; } } /* if we get down here, then we couldn't find a representation k*b^n + c */ end_kbnc: mpz_clear (diff); mpz_clear (abs_diff); mpz_clear (b_n); mpz_clear (k_b_n); mpz_clear (lhs); mpz_clear (rhs); mpz_clear (test_k); mpz_clear (max_k); mpz_clear (base); mpz_clear (base_min); mpz_clear (base_max); return ret; } /* This function searches for a nice representation of z We are trying to see if z = k*b^n + c Some examples that we can find: "3^218+5123" "(199*3^218+5123)/(2*17*587*1187)" "(199*3^218 + 5123)/2/17/587/1187" */ int kbnc_str (double *k, unsigned long *b, unsigned long *n, signed long *c, char *z, mpz_t num) { int i = 0; int total = 0; char strk[11]; char strb[11]; char strn[11]; char strc[11]; mpz_t tmp; /* make sure we have a place to put our results */ if (k == NULL || b == NULL || n == NULL || c == NULL || z == NULL) return 0; *b = 0; for (i = 0; i < strlen(z); i++) { if (z[i] == '(' || z[i] == '{' || z[i] == '[') continue; /* check to see if the input is k*b^n+c */ total = sscanf (z+i, "%10[0-9]*%10[0-9]^%10[0-9]%*[ +]%10[0-9]", strk, strb, strn, strc); if (total == 4) { *k = (double) strtoul (strk, NULL, 10); *b = strtoul (strb, NULL, 10); *n = strtoul (strn, NULL, 10); *c = strtol (strc, NULL, 10); break; } /* check to see if the input is k*b^n-c */ total = sscanf 
(z+i, "%10[0-9]*%10[0-9]^%10[0-9]%*[ -]%10[0-9]", strk, strb, strn, strc); if (total == 4) { *k = (double) strtoul (strk, NULL, 10); *b = strtoul (strb, NULL, 10); *n = strtoul (strn, NULL, 10); *c = strtol (strc, NULL, 10); *c *= -1; break; } /* check to see if the input is b^n+c (k = 1) */ total = sscanf (z+i, "%10[0-9]^%10[0-9]%*[ +]%10[0-9]", strb, strn, strc); if (total == 3) { *k = 1.0; *b = strtoul (strb, NULL, 10); *n = strtoul (strn, NULL, 10); *c = strtol (strc, NULL, 10); break; } /* check to see if the input is b^n-c (k = 1) */ total = sscanf (z+i, "%10[0-9]^%10[0-9]%*[ -]%10[0-9]", strb, strn, strc); if (total == 3) { *k = 1.0; *b = strtoul (strb, NULL, 10); *n = strtoul (strn, NULL, 10); *c = strtol (strc, NULL, 10); *c *= -1; break; } break; } /* first, check to see if we found a k*b^n+c */ if (*b) { /* if we did, make sure that (k*b^n+c) is divisible by num */ mpz_init_set_ui (tmp, *b); mpz_pow_ui (tmp, tmp, *n); mpz_mul_ui (tmp, tmp, (unsigned long) *k); if (*c >= 0) mpz_add_ui (tmp, tmp, *c); else mpz_sub_ui (tmp, tmp, (*c * -1)); if (mpz_divisible_p (tmp, num)) return 1; } /* set b to zero so users have a second way to know we didn't find k,b,n,c */ *b = 0; /* if we get here, we didn't find a formula k*b^n+c for z */ return 0; } /* this method doesn't care if v is 32 or 64 bits... 
*/ unsigned long gw_log_2(unsigned long v) { unsigned long r = 0; /* r will be lg(v) */ while (v >>= 1) { r++; } return r; } int gw_ecm_stage1 (mpz_t f, curve *P, mpmod_t modulus, double B1, double *B1done, mpz_t go, double gw_k, unsigned long gw_b, unsigned long gw_n, signed long gw_c) { ecm_uint gw_B1done = *B1done; unsigned long siz_x, siz_z; /* Size of gw_x and gw_y as longs */ mpz_t gw_x, gw_z, gw_A; int youpi; if (mpz_cmp_ui (go, 1) > 0) { mpres_t b; mpres_init (b, modulus); mpres_add_ui (b, P->A, 2, modulus); mpres_div_2exp (b, b, 2, modulus); /* b == (A+2)/4 */ ecm_mul (P->x, P->y, go, modulus, b); mpres_clear (b, modulus); } outputf (OUTPUT_VERBOSE, "Using gwnum_ecmStage1(%.0f, %d, %d, %d, %.0f, %ld)\n", gw_k, gw_b, gw_n, gw_c, B1, gw_B1done); /* Copy x, z and A values from modular representation to plain integers */ /* Allocate enough memory for any residue (mod k*b^n+c) for x, z */ mpz_init2 (gw_x, (gw_n+1)*gw_log_2(gw_b)+64); mpz_init2 (gw_z, (gw_n+1)*gw_log_2(gw_b)+64); mpz_init (gw_A); /* mpres_get_z always produces non-negative integers */ mpres_get_z (gw_x, P->x, modulus); mpres_get_z (gw_z, P->y, modulus); mpres_get_z (gw_A, P->A, modulus); /* gwnum_ecmStage1() wants long int pointers for size_x, size_z, so copy them into long int vars */ siz_x = SIZ(gw_x); siz_z = SIZ(gw_z); /* George Woltman says that the gwnum library can handle k values up to 49 or 50 bits long, and the maximum c value is +/-8388607 */ ASSERT_ALWAYS (gw_k == rint (gw_k)); /* check that k is an integer */ ASSERT_ALWAYS (1.0 <= gw_k && gw_k <= 562949953421312.0); ASSERT_ALWAYS (-8388607 <= gw_c && gw_c <= 8388607); #if GMP_NUMB_BITS <= 32 youpi = gwnum_ecmStage1_u32 (gw_k, gw_b, gw_n, gw_c, PTR(modulus->orig_modulus), ABSIZ(modulus->orig_modulus), B1, &gw_B1done, PTR(gw_A), ABSIZ(gw_A), PTR(gw_x), &siz_x, PTR(gw_z), &siz_z, NULL, 0); #else /* contributed by David Cleaver */ youpi = gwnum_ecmStage1_u64 (gw_k, gw_b, gw_n, gw_c, PTR(modulus->orig_modulus), 
ABSIZ(modulus->orig_modulus), B1, &gw_B1done, PTR(gw_A), ABSIZ(gw_A), PTR(gw_x), &siz_x, PTR(gw_z), &siz_z, NULL, 0); #endif /* Test that not more was written to gw_x and gw_z than we had space for */ ASSERT_ALWAYS (siz_x <= (unsigned long) ALLOC(gw_x)); ASSERT_ALWAYS (siz_z <= (unsigned long) ALLOC(gw_z)); SIZ(gw_x) = siz_x; SIZ(gw_z) = siz_z; outputf (OUTPUT_DEVVERBOSE, "gw_ecm_stage1: after gwnum_ecmStage1, \n" "B1done = %lu, x = %Zd\nz = %Zd\n", gw_B1done, gw_x, gw_z); /* Copy x, z back to P and clean up the temp vars */ mpres_set_z (P->x, gw_x, modulus); mpres_set_z (P->y, gw_z, modulus); mpz_clear (gw_A); mpz_clear (gw_z); mpz_clear (gw_x); *B1done = gw_B1done; /* Here is a list of gwnum return codes. */ /* In the case of 2 or 5, we should continue on and let gmp-ecm */ /* do stage 1, instead of throwing an error and quitting */ /* #define ES1_SUCCESS 0 *//* Success, but no factor */ /* #define ES1_FACTOR_FOUND 1 *//* Success, factor found */ /* #define ES1_CANNOT_DO_IT 2 *//* This k,b,n,c cannot be handled */ /* #define ES1_MEMORY 3 *//* Out of memory */ /* #define ES1_INTERRUPT 4 *//* Execution interrupted */ /* #define ES1_CANNOT_DO_QUICKLY 5 *//* Requires 3-multiply reduction */ /* #define ES1_HARDWARE_ERROR 6 *//* An error was detected, most */ if (youpi == ES1_CANNOT_DO_IT || youpi == ES1_CANNOT_DO_QUICKLY) { outputf (OUTPUT_VERBOSE, "Notice: Did not use gwnum_ecmStage1(%.0f, %d, %d, %d, %.0f, %ld)\n", gw_k, gw_b, gw_n, gw_c, B1, gw_B1done); youpi = ECM_NO_FACTOR_FOUND; goto end_of_gwecm; } if (youpi > 1) { outputf (OUTPUT_ERROR, "GW stage 1 returned code %d\n", youpi); youpi = ECM_ERROR; goto end_of_gwecm; } if (youpi == 1) { /* How did that happen? Since we passed z, GWNUM should not do an extgcd and so not find factors... but if it did anyways, we deal with it. Who's going to turn down a factor? 
*/ outputf (OUTPUT_DEVVERBOSE, "gw_ecm_stage1: Strange, gwnum_ecmStage1 reports a factor\n"); mpres_get_z (f, P->x, modulus); youpi = ECM_FACTOR_FOUND_STEP1; goto end_of_gwecm; } /* Normalize z (in P->y) to 1 */ youpi = ECM_NO_FACTOR_FOUND; if (!mpres_invert (P->y, P->y, modulus)) /* Factor found? */ { mpres_gcd (f, P->y, modulus); youpi = ECM_FACTOR_FOUND_STEP1; } else { mpres_mul (P->x, P->x, P->y, modulus); mpres_set_ui (P->y, 1UL, modulus); } end_of_gwecm: return youpi; } ecm-6.4.4/test.pp10000755023561000001540000001043212106741273010622 00000000000000#!/bin/sh # test file for P+1 method # # Copyright 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2012 Jim Fougeron, # Alexander Kruppa, Dave Newman and Paul Zimmermann. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. If not, see # http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., # 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. PP1="$1 -pp1" # Call with "checkcode $? 
n" to check that return code is n # (see test.pm1 for the explanation of the different return codes) checkcode () { if [ $1 != $2 ] then echo "############### ERROR ###############" echo "Expected return code $2 but got $1" exit 1 fi } checkcode2 () { if [ $1 != $2 ] then if [ $1 != $3 ] then echo "############### ERROR ###############" echo "Expected return code $2 or $3 but got $1" exit 1 fi fi } # P+1 requires that sigma^2-4 is a quadratic non-residue mod p echo 328006342451 | $PP1 -x0 5 120 7043; checkcode $? 8 # check rational seed echo 328006342451 | $PP1 -x0 1/5 120 7043; checkcode $? 8 # try primes < d in stage 2 echo 2050449218179969792522461197 | $PP1 -x0 6 -k 1 20 0-1e6; checkcode $? 14 echo 6215074747201 | $PP1 -x0 5 630 199729; checkcode $? 8 # bug in 6.1.3 echo 6215074747201 | $PP1 -power 2 -x0 5 630 199729; checkcode $? 8 echo 6215074747201 | $PP1 -dickson 3 -x0 5 630 199729; checkcode $? 8 echo 8857714771093 | $PP1 -x0 3 23251 49207; checkcode $? 8 echo 236344687097 | $PP1 -x0 3 619 55001; checkcode $? 8 echo 87251820842149 | $PP1 -x0 5 3691 170249; checkcode $? 8 echo 719571227339189 | $PP1 -x0 4 41039 57679; checkcode $? 8 echo 5468575720021 | $PP1 -x0 6 1439 175759; checkcode $? 8 echo 49804972211 | $PP1 -x0 5 15443 268757; checkcode $? 8 echo 329573417220613 | $PP1 -x0 3 5279 101573; checkcode $? 8 echo 4866979762781 | $PP1 -x0 4 7309 97609; checkcode $? 8 echo 187333846633 | $PP1 -x0 3 2063 9851; checkcode $? 8 echo 332526664667473 | $PP1 -x0 3 65993 111919; checkcode $? 8 echo 265043186297 | $PP1 -x0 3 8761 152791; checkcode $? 8 echo 207734163253 | $PP1 -x0 3 1877 4211; checkcode $? 8 echo 225974065503889 | $PP1 -x0 5 -k 5 7867 8243; checkcode $? 8 echo 660198074631409 | $PP1 -x0 5 22541 115679; checkcode $? 8 echo 563215815517 | $PP1 -x0 3 3469 109849; checkcode $? 8 # test B2min-B2 echo 563215815517 | $PP1 -x0 3 3469 109849-109849; checkcode $? 8 echo 409100738617 | $PP1 -x0 3 19 19; checkcode $? 
8 # p37 from 45^123+1 found by Peter Montgomery with B1=30M echo 2277189375098448170118558775447117254551111605543304035536750762506158547102293199086726265869065639109 | $PP1 -x0 3 2337233 132554351 checkcode $? 14 # bug in ecm-5.0 (overflow in fin_diff_coeff) echo 630503947831861669 | $PP1 -x0 5 7 9007199254740000-9007199254741000; checkcode $? 8 # bug in ecm-6.0.1 on 64-bit machines. The error message "Error, maximal # step1 bound for P+1 is ..." on 32-bit machines is normal. echo "NOTE: NEXT TEST WILL FAIL ON 32BIT MACHINES, THIS IS EXPECTED." echo 8589934621 | $PP1 -x0 10 4294967310-4294967311 1; checkcode2 $? 1 8 # A test with a larger input number to test modular arithmetic routines not # in mulredc*.asm. This input has 1363 bits so it has 22 64 bit words # (43 32 bit words) and cannot use mulredc which handles only up to 20 limbs echo "6054018161*10^400+417727253109" | $PP1 -x0 4 2e3 2e6; checkcode $? 14 # Bug reported by Andreas Schickel: on 32 bit systems, the code in lucas.c # for generating Lucas chains is prone to causing integer overflows, giving # incorrect chains for some primes. This test exhibits the bug on 32 bit # systems but works on 64 bit echo 154618728587 | $PP1 -x0 3 -go 36 4294957296-4294967295 1; checkcode $? 8 echo 18446744073709551337 | $PP1 -pp1 -x0 2 70823 714487; checkcode $? 8 echo "All P+1 tests are ok." ecm-6.4.4/toomcook.c0000644023561000001540000003261312106741273011221 00000000000000/* Implementation of the Toom-Cook 3-way and 4-way algorithms for polynomial convolution products. Copyright 2001, 2002, 2003, 2004, 2005, 2006 Paul Zimmermann, Alexander Kruppa and Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "ecm-impl.h" #define A0 A[i] #define A1 A[l+i] #define A2 A[2*l+i] #define B0 B[i] #define B1 B[l+i] #define B2 B[2*l+i] #define C0 C[i] #define C1 C[l+i] #define C2 C[2*l+i] #define C3 C[3*l+i] #define C4 C[4*l+i] #define t0 t[i] #define t2 t[2*l+i-1] #define T t[4*l-2] /* Puts in C[0..2len-2] the product of A[0..len-1] and B[0..len-1]. This version works for all input sizes, but cannot handle input arrays overlapping with output. Assumes len >= 1. The auxiliary memory M(len) necessary in t satisfies: M(0) = 0, M(1) = 0, M(2) = 1, M(3) = 3, otherwise M(len) = 2*(2*l-1) + max(M(l), 1) with l = ceil(len/3). 
We prove M(len) <= 2*len + 2 * k with k = ceil(log[3](len)) by induction: 4*l-2 + max(M(l), 1) <= 4*l-2 + max(2*l + 2 * (k-1), 1) <= 6*l - 2 + 2 * (k-1) <= 2*(len+2) - 2 + 2 * (k-1) <= 2*len + 2 * k */ void toomcook3 (listz_t C, listz_t A, listz_t B, unsigned int len, listz_t t) { int i, l, k; if (len <= 2 || len == 4) { karatsuba (C, A, B, len, t); return; } l = (len + 2) / 3; /* ceil(len/3) */ k = len - 2 * l; /* smaller part */ for (i = 0; i < k; i++) /* uses t[0..3*l+k-1] */ { mpz_add (C0, A0, A2); mpz_sub (C2, C0, A1); /* C2 = A0 - A1 + A2 = A(-1) */ mpz_add (C0, C0, A1); /* C0 = A0 + A1 + A2 = A(1) */ mpz_add (C1, B0, B2); mpz_sub (C3, C1, B1); /* C3 = B0 - B1 + B2 = B(-1) */ mpz_add (C1, C1, B1); /* C1 = B0 + B1 + B2 = B(1) */ } for (; i < l; i++) /* uses t[0..4*l-1] */ { /* A2 and B2 are smaller than the rest */ mpz_add (C0, A0, A1); mpz_sub (C2, A0, A1); mpz_add (C1, B0, B1); mpz_sub (C3, B0, B1); } toomcook3 (t, C + 2 * l, C + 3 * l, l, &T); /* t0 = C2*C3 = A(-1)*B(-1) = C(-1), len(t0) = 2*l-1 */ for (i = 0; i < k; i++) { mpz_mul_2exp (C2, A2, 1); /* C2 = A(2), C3 = B(2) */ mpz_add (C2, C2, A1); mpz_mul_2exp (C2, C2, 1); mpz_add (C2, C2, A0); mpz_mul_2exp (C3, B2, 1); mpz_add (C3, C3, B1); mpz_mul_2exp (C3, C3, 1); mpz_add (C3, C3, B0); } for (; i < l; i++) { mpz_mul_2exp (C2, A1, 1); mpz_add (C2, C2, A0); mpz_mul_2exp (C3, B1, 1); mpz_add (C3, C3, B0); } toomcook3 (t + 2 * l - 1, C + 2 * l, C + 3 * l, l, &T); /* t2 = C2*C3 = A(2)*B(2) = C(2), len(t2) = 2*l-1 */ toomcook3 (C + 2 * l, C, C + l, l, &T); /* C2 = C0*C1 = A(1)*B(1) = C(1), len(C1) = 2*l-1 */ toomcook3 (C, A, B, l, &T); /* C0 = A(0)*B(0) = C(0), len(C0) = 2*l-1 */ toomcook3 (C + 4 * l, A + 2 * l, B + 2 * l, k, &T); /* C4 = A(inf)*B(inf) = C(inf), len(C4) = 2*k-1 */ /* C0: C_0 C2: C(1) C4: C_4 t0: C(-1) t2: C(2) */ /* C_3 = A_1 * B_2 + A_2 * B_1, len(C_3) = l+k-1 We need not bother to compute C_3[2l-1] if k= 0. For len=3, toomcook4 would use 6 multiplies, toomcook3 uses only 5. 
For len=6, toomcook4 would use 18 multiplies, toomcook3 only 15. For len=9, toomcook4 would use 30 multiplies, toomcook3 only 25. Further values where toomcook3 is faster are 17,18,26,27,77,78,79,80,81. */ if (len <= 2) { karatsuba (C, A, B, len, t); return; } if (len == 3 || len == 5 || len == 6 || len == 9 || len == 17 || len == 18 || (25 <= len && len <= 27) || (77 <= len && len <= 81)) { toomcook3 (C, A, B, len, t); return; } l = (len + 3) / 4; /* l = ceil(len/4) */ k = len - 3 * l; /* k = smaller part. len = 3*l + k, k <= l */ for (i = 0; i < l; i++) { /*** Evaluate A(2), A(-2), 8*A(1/2) ***/ mpz_mul_2exp (C0, A0, 1); mpz_add (C0, C0, A1); mpz_mul_2exp (C0, C0, 1); mpz_add (C0, C0, A2); mpz_mul_2exp (C0, C0, 1); if (i < k) { mpz_add (C0, C0, A3); /* C[0 .. l-1] = 8*A(1/2) */ mpz_mul_2exp (C2, A3, 2); mpz_add (C2, C2, A1); mpz_mul_2exp (C2, C2, 1); /* C[2l .. 3l-1] = 8*A_3 + 2*A_1 */ } else mpz_mul_2exp (C2, A1, 1); mpz_mul_2exp (T, A2, 2); mpz_add (T, T, A0); /* T = 4*A_2 + A0 */ mpz_sub (C4, T, C2); /* C[4l .. 5l-1] = A(-2) */ mpz_add (C2, C2, T); /* C[2l .. 3l-1] = A(2) */ #ifdef DEBUG gmp_fprintf (ECM_STDOUT, "8*A(1/2)[%d] = %Zd\n", i, C0); gmp_fprintf (ECM_STDOUT, "A(2)[%d] = %Zd\n", i, C2); gmp_fprintf (ECM_STDOUT, "A(-2)[%d] = %Zd\n", i, C4); #endif /*** Evaluate B(2), B(-2), 8*B(1/2) ***/ mpz_mul_2exp (C1, B0, 1); mpz_add (C1, C1, B1); mpz_mul_2exp (C1, C1, 1); mpz_add (C1, C1, B2); mpz_mul_2exp (C1, C1, 1); if (i < k) { mpz_add (C1, C1, B3); /* C[l .. 2l-1] = 8*B(1/2) */ mpz_mul_2exp (C3, B3, 2); mpz_add (C3, C3, B1); mpz_mul_2exp (C3, C3, 1); /* C[3l .. 3l+k-1] = 8*B_3 + 2*B_1 */ } else mpz_mul_2exp (C3, B1, 1); mpz_mul_2exp (T, B2, 2); mpz_add (T, T, B0); /* T = 4*B_2 + B0 */ mpz_sub (C5, T, C3); /* C[5l .. 5l+k-1] = B(-2) */ mpz_add (C3, C3, T); /* C[3l .. 
3l+k-1] = B(2) */ #ifdef DEBUG gmp_fprintf (ECM_STDOUT, "8*B(1/2)[%d] = %Zd\n", i, C1); gmp_fprintf (ECM_STDOUT, "B(2)[%d] = %Zd\n", i, C3); gmp_fprintf (ECM_STDOUT, "B(-2)[%d] = %Zd\n", i, C5); #endif } toomcook4 (t, C, C + l, l, &T); /* t0 = 8*A(1/2) * 8*B(1/2) = 64*C(1/2) */ toomcook4 (t + 2 * l - 1, C + 2 * l, C + 3 * l, l, &T); /* t2 = A(2) * B(2) = C(2) */ toomcook4 (t + 4 * l - 2, C + 4 * l, C + 5 * l, l, &T); /* t4 = A(-2) * B(-2) = C(-2) */ for (i = 0; i < l; i++) { mpz_add (C0, A0, A2); mpz_add (C1, B0, B2); if (i < k) { mpz_add (T, A1, A3); mpz_sub (C2, C0, T); /* C2 = A(-1) */ mpz_add (C0, C0, T); /* C0 = A(1) */ mpz_add (T, B1, B3); mpz_sub (C3, C1, T); /* C3 = B(-1) */ mpz_add (C1, C1, T); /* C1 = B(1) */ } else { mpz_sub (C2, C0, A1); mpz_add (C0, C0, A1); mpz_sub (C3, C1, B1); mpz_add (C1, C1, B1); } #ifdef DEBUG gmp_fprintf (ECM_STDOUT, "A(1)[%d] = %Zd\n", i, C0); gmp_fprintf (ECM_STDOUT, "A(-1)[%d] = %Zd\n", i, C2); gmp_fprintf (ECM_STDOUT, "B(1)[%d] = %Zd\n", i, C1); gmp_fprintf (ECM_STDOUT, "B(-1)[%d] = %Zd\n", i, C3); #endif } toomcook4 (C + 4 * l, C + 2 * l, C + 3 * l, l, &T); /* C4 = A(-1) * B(-1) = C(-1) */ toomcook4 (C + 2 * l, C, C + l, l, &T); /* C2 = A(1) * B(1) = C(1) */ toomcook4 (C, A, B, l, &T); /* C0 = A_0 * B_0 = C_0 */ toomcook4 (C + 6 * l, A + 3 * l, B + 3 * l, k, &T); /* C6 = A_3 * B_3 = C_6 */ for (i = 0; i < 2 * l - 1; i++) { #ifdef DEBUG gmp_fprintf (ECM_STDOUT, "C(0)[%d] = %Zd\n", i, C0); gmp_fprintf (ECM_STDOUT, "C(1)[%d] = %Zd\n", i, C2); gmp_fprintf (ECM_STDOUT, "C(-1)[%d] = %Zd\n", i, C4); gmp_fprintf (ECM_STDOUT, "C(2)[%d] = %Zd\n", i, t2); gmp_fprintf (ECM_STDOUT, "C(-2)[%d] = %Zd\n", i, t4); gmp_fprintf (ECM_STDOUT, "64*C(1/2)[%d] = %Zd\n", i, t0); if (i < 2 * k - 1) gmp_fprintf (ECM_STDOUT, "C(inf)[%d] = %Zd\n", i, C6); gmp_fprintf (ECM_STDOUT, "C_0[%d] = %Zd\n", i, C0); #endif mpz_add (t0, t0, t2); /* t0 = 65 34 20 16 20 34 65 */ mpz_sub (T, C2, C4); /* T = 2*C_odd(1) = 0 2 0 2 0 2 0 */ mpz_add (C2, C2, C4); /* C2 = 
2*C_even(1) */ mpz_fdiv_q_2exp (C2, C2, 1); /* C2 = C_even(1) */ mpz_add (C4, t2, t4); /* C4 = 2*C_even(2) */ mpz_fdiv_q_2exp (C4, C4, 1); /* C4 = C_even(2) */ mpz_sub (t4, t2, t4); /* t4 = 2*C_odd(2) */ mpz_fdiv_q_2exp (t4, t4, 2); /* t4 = C_odd(2)/2 = C_1 + 4*C_3 + 16*C_5 */ mpz_fdiv_q_2exp (t2, T, 1); /* t2 = C_odd(1) */ mpz_sub (t0, t0, T); /* t0 = 65 32 20 14 20 32 65 */ mpz_mul_2exp (T, T, 4); mpz_sub (t0, t0, T); /* t0 = 65 0 20 -18 20 0 65 */ if (i < 2 * k - 1) { mpz_add (T, C0, C6); /* T = C_0 + C_6 */ mpz_sub (C2, C2, T); /* C2 = C_2 + C_4 */ mpz_sub (t0, t0, T); /* t0 = 64 0 20 -18 20 0 64 */ mpz_mul_2exp (T, T, 5); } else { mpz_sub (C2, C2, C0); /* C2 = C_2 + C_4 */ mpz_sub (t0, t0, C0); /* t0 = 64 0 20 -18 20 0 */ mpz_mul_2exp (T, C0, 5); } mpz_fdiv_q_2exp (t0, t0, 1); /* t0 = 32 0 10 -9 10 0 32 */ mpz_sub (t0, t0, T); /* t0 = 0 0 10 -9 10 0 0 */ mpz_sub (t0, t0, C2); /* t0 = 0 0 9 -9 9 0 0 */ mpz_divexact_ui (t0, t0, 9); /* t0 = 0 0 1 -1 1 0 0 */ mpz_sub (t0, C2, t0); /* t0 = C_3 */ mpz_sub (t2, t2, t0); /* t2 = C_1 + C_5 */ mpz_mul_2exp (T, t0, 2); /* T = 4*C_3 */ mpz_sub (t4, t4, T); /* t4 = C_1 + 16*C_5 */ mpz_sub (t4, t4, t2); /* t4 = 15*C_5 */ mpz_divexact_ui (t4, t4, 15); /* t4 = C_5 */ mpz_sub (t2, t2, t4); /* t2 = C_1 */ mpz_sub (C4, C4, C0); /* C4 = 4*C_2 + 16*C_4 + 64*C_6 */ mpz_fdiv_q_2exp (C4, C4, 2); /* C4 = C_2 + 4*C_4 + 16*C_6 */ if (i < 2 * k - 1) { mpz_mul_2exp (T, C6, 4); mpz_sub (C4, C4, T); /* C4 = C_2 + 4*C_4 */ } mpz_sub (C4, C4, C2); /* C4 = 3*C_4 */ mpz_divby3_1op (C4); /* C4 = C_4 */ mpz_sub (C2, C2, C4); /* C2 = C_2 */ #ifdef DEBUG gmp_fprintf (ECM_STDOUT, "C_1[%d] = %Zd\n", i, t2); gmp_fprintf (ECM_STDOUT, "C_2[%d] = %Zd\n", i, C2); gmp_fprintf (ECM_STDOUT, "C_3[%d] = %Zd\n", i, t0); gmp_fprintf (ECM_STDOUT, "C_4[%d] = %Zd\n", i, C4); gmp_fprintf (ECM_STDOUT, "C_5[%d] = %Zd\n", i, t4); if (i < 2 * k - 1) gmp_fprintf (ECM_STDOUT, "C_6[%d] = %Zd\n", i, C6); #endif } for (i = 0; i < l - 1; i++) mpz_add (C1, C1, t2); mpz_set 
(C1, t2); for (i = l; i < 2 * l - 1; i++) mpz_add (C1, C1, t2); for (i = 0; i < l - 1; i++) mpz_add (C3, C3, t0); mpz_set (C3, t0); for (i = l; i < 2 * l - 1; i++) mpz_add (C3, C3, t0); for (i = 0; i < l - 1; i++) mpz_add (C5, C5, t4); mpz_set (C5, t4); for (i = l; i < l + k - 1; i++) mpz_add (C5, C5, t4); } ecm-6.4.4/sp.h0000644023561000001540000003404112106741273010013 00000000000000/* sp.h - header file for the sp library Copyright 2005, 2006, 2007, 2008, 2010, 2011, 2012 Dave Newman, Jason Papadopoulos, Paul Zimmermann, Brian Gladman, Alexander Kruppa. Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2010 Free Software Foundation, Inc. (for parts from gmp-impl.h). This file is part of the SP library. The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef _SP_H #define _SP_H #include "config.h" #include #ifdef HAVE_SYS_TYPES_H #include /* needed for size_t */ #endif #ifndef TUNE #include "ecm-params.h" #else extern size_t NTT_GFP_TWIDDLE_DIF_BREAKOVER; extern size_t NTT_GFP_TWIDDLE_DIT_BREAKOVER; extern size_t MUL_NTT_THRESHOLD; extern size_t PREREVERTDIVISION_NTT_THRESHOLD; extern size_t POLYINVERT_NTT_THRESHOLD; extern size_t POLYEVALT_NTT_THRESHOLD; extern size_t MPZSPV_NORMALISE_STRIDE; #endif #include #if defined( __GNUC__ ) && __GNUC__ >= 3 #define ATTRIBUTE_UNUSED __attribute__ ((unused)) #else #define ATTRIBUTE_UNUSED #endif /************** * GMP_IMPL.H * **************/ #ifdef WANT_ASSERT #include #define ASSERT(expr) assert (expr) #else #define ASSERT(expr) do {} while (0) #endif /* the following was inspired by longlong.h and gmp-impl.h; * note that a small prime must be the size of a GMP limb */ typedef mp_limb_t UWtype; typedef unsigned int UHWtype; #if (defined(_PA_RISC1_1) && defined(__GNUC__)) /* this seems to be needed, otherwise umul_ppmm() does not work properly */ typedef mp_limb_t USItype __attribute__ ((mode (SI))); typedef mp_limb_t UDItype __attribute__ ((mode (DI))); #else typedef mp_limb_t USItype; typedef mp_limb_t UDItype; #endif #ifndef W_TYPE_SIZE #define W_TYPE_SIZE GMP_LIMB_BITS #endif #ifndef ULONG_MAX #define ULONG_MAX __GMP_ULONG_MAX #endif #define LONGLONG_STANDALONE #include "longlong.h" /* we use the remainder tree for products of 2^I0_THRESHOLD moduli or more, and the naive method for fewer moduli. We must have I0_THRESHOLD >= 1. */ #define I0_THRESHOLD 7 /********* * TYPES * *********/ /* SP */ /* the type for both a small prime, and a residue modulo a small prime. 
* Small primes must be 1 bit smaller than the word size for 32-bit * systems (otherwise there may not be enough suitable primes), but * may be 2+ bits smaller when the word size exceeds 32 bits (and this * simplifies modular reductions) * * For a residue x modulo a sp p, we require 0 <= x < p */ typedef UWtype sp_t; #if W_TYPE_SIZE <= 32 #define SP_NUMB_BITS (W_TYPE_SIZE - 1) #else #define SP_NUMB_BITS (W_TYPE_SIZE - 2) #endif #define SP_MIN ((sp_t)1 << (SP_NUMB_BITS - 1)) #define SP_MAX ((sp_t)(-1) >> (W_TYPE_SIZE - SP_NUMB_BITS)) /* vector of residues modulo a common small prime */ typedef sp_t * spv_t; /* length of a spv */ typedef unsigned long spv_size_t; typedef struct { spv_t ntt_roots; spv_size_t twiddle_size; spv_t twiddle; } __sp_nttdata; typedef __sp_nttdata sp_nttdata_t[1]; #define MAX_NTT_BLOCK_SIZE 128 /* Which steps to perform in convolution product funtions: forward transform, pair-wise multiplication, inverse transform */ #define NTT_MUL_STEP_FFT1 1 #define NTT_MUL_STEP_FFT2 2 #define NTT_MUL_STEP_MUL 4 #define NTT_MUL_STEP_IFFT 8 /* SPM */ /* small prime modulus - this contains some precomputed constants to * calculate modulo a sp */ typedef struct { sp_t sp; /* value of the sp */ sp_t mul_c; /* constant used for reduction mod sp */ sp_t invm; /* -1/sp mod 2^GMP_NUMB_BITS */ sp_t Bpow; /* B^(n+1) mod sp where the input N has n limbs */ sp_t prim_root; sp_t inv_prim_root; sp_nttdata_t nttdata; sp_nttdata_t inttdata; spv_t scratch; } __spm_struct; typedef __spm_struct * spm_t; /* MPZSPM */ typedef mpz_t * mpzv_t; typedef struct { /* number of small primes needed to represent each coeff */ unsigned int sp_num; spv_size_t max_ntt_size; mpz_t modulus; /* spm data */ spm_t *spm; /* precomputed crt constants, see mpzspm.c */ mpzv_t crt1, crt2; sp_t *crt3, **crt4, *crt5; /* product tree to speed up conversion from mpz to sp */ mpzv_t *T; /* product tree */ unsigned int d; /* ceil(log(sp_num)/log(2)) */ } __mpzspm_struct; typedef __mpzspm_struct * 
mpzspm_t; /* MPZSPV */ /* sp representation of a mpz polynomial */ typedef spv_t * mpzspv_t; #define MAX(x,y) (((x)<(y))?(y):(x)) #define MIN(x,y) (((x)<(y))?(x):(y)) #define SIZ(x) ((x)->_mp_size) #define PTR(x) ((x)->_mp_d) /* expanding macros and then turning them into strings requires two levels of macro-izing */ #define _(x) #x #define STRING(x) _(x) /************* * FUNCTIONS * *************/ /* general */ static inline unsigned int ceil_log_2 (spv_size_t x) { unsigned int a = 0; x--; while (x) { a++; x >>= 1; } return a; } /* Conversion functions sp_t <-> mpz_t. Using mpz_*_ui() functions is not portable as those take unsigned long's, but on some systems (e.g. 64 bit Windows with Visual C), unsigned long has 32 bits while sp_t should use 64 */ static inline void mpz_set_sp (mpz_t m, const sp_t n) { /* Is sizeof() a safe way of determining whether the conversion is lossless? */ if (sizeof (sp_t) <= sizeof (unsigned long)) { mpz_set_ui (m, (unsigned long) n); } else if (sizeof (sp_t) == 8 && sizeof (unsigned long) == 4) { /* We want to right-shift by 32 bits on a 64 bit system here. Putting a shift amount of 32 as a constant causes a compiler warning on 32 bit systems. So we put sizeof (sp_t) * 4 which always evaluates to 32 in this branch of the code, and does not cause a compiler warning if sp_t is only 4 bytes wide. */ mpz_set_ui (m, (unsigned long) (n >> (sizeof (sp_t) * 4))); mpz_mul_2exp (m, m, 32UL); mpz_add_ui (m, m, (unsigned long int) (n & 4294967295UL)); } else { abort (); } } static inline sp_t mpz_get_sp (const mpz_t n) { if (sizeof (sp_t) == sizeof (unsigned long)) { return (sp_t) mpz_get_ui (n); } else if (sizeof (sp_t) == sizeof (mp_limb_t)) { /* mpz_get_ui() returns the least significant bits of the absolute value of its argument that fit in an unsigned long. In the current GMP implementation with sign/magnitude representation, mpz_getlimbn() also returns the least sigificant bits of the absolute value. 
To allow for a future change to 2's-complement representation in GMP, we should explicitly use mpz_abs() to a temp var here. */ return (sp_t) mpz_getlimbn (n, 0); } else { abort (); } } void * sp_aligned_malloc (size_t len); void sp_aligned_free (void *newptr); /* sp */ /* Routines for arithmetic on residues modulo a small prime * * All functions return values in the range 0 <= x < p. * * The variable name of the modulus is 'p' if the input must be prime, * 'm' if we also allow composites. */ static inline sp_t sp_sub(sp_t a, sp_t b, sp_t m) { #if (defined(__GNUC__) || defined(__ICL)) && \ (defined(__x86_64__) || defined(__i386__)) sp_t t = 0, tr = a; __asm__ ( "sub %2, %0 # sp_sub: tr -= b\n\t" "cmovc %3, %1 # sp_sub: if (a < b) t = m\n\t" : "+&r" (tr), "+r" (t) : "g" (b), "g" (m) : "cc" ); return tr + t; #elif defined(_MSC_VER) && !defined(_WIN64) __asm { mov eax, a xor edx, edx sub eax, b cmovb edx, m add eax, edx } #else if (a >= b) return a - b; else return a - b + m; #endif } static inline sp_t sp_add(sp_t a, sp_t b, sp_t m) { #if (defined(__GNUC__) || defined(__ICL)) && \ (defined(__x86_64__) || defined(__i386__)) sp_t t = a - m, tr = a + b; __asm__ ( "add %2, %1 # sp_add: t += b\n\t" "cmovc %1, %0 # sp_add: if (cy) tr = t \n\t" : "+r" (tr), "+&r" (t) : "g" (b) : "cc" ); return tr; #elif defined(_MSC_VER) && !defined(_WIN64) __asm { mov eax, a add eax, b mov edx, eax sub edx, m cmovnc eax, edx } #elif SP_NUMB_BITS <= W_TYPE_SIZE - 1 sp_t t = a + b; if (t >= m) t -= m; return t; #else return sp_sub(a, m - b, m); #endif } /* functions used for modular reduction */ #if SP_NUMB_BITS <= W_TYPE_SIZE - 2 /* having a small modulus allows the reciprocal * to be one bit larger, which guarantees that the * initial remainder fits in a word and also that at * most one correction is necessary */ #define sp_reciprocal(invxl,xl) \ do { \ ATTRIBUTE_UNUSED mp_limb_t dummy; \ udiv_qrnnd (invxl, dummy, \ (sp_t) 1 << (2 * SP_NUMB_BITS + 1 - \ W_TYPE_SIZE), 0, xl); \ } while (0) 
static inline sp_t sp_udiv_rem(sp_t nh, sp_t nl, sp_t d, sp_t di) { sp_t r; mp_limb_t q1, q2; ATTRIBUTE_UNUSED mp_limb_t tmp; q1 = nh << (2*(W_TYPE_SIZE - SP_NUMB_BITS)) | nl >> (2*SP_NUMB_BITS - W_TYPE_SIZE); umul_ppmm (q2, tmp, q1, di); r = nl - d * (q2 >> 1); return sp_sub(r, d, d); } #else /* big modulus; no shortcuts allowed */ #define sp_reciprocal(invxl,xl) \ do { \ mp_limb_t dummy; \ udiv_qrnnd (invxl, dummy, \ (sp_t) 1 << (2 * SP_NUMB_BITS - \ W_TYPE_SIZE), 0, xl); \ } while (0) static inline sp_t sp_udiv_rem(sp_t nh, sp_t nl, sp_t d, sp_t di) { mp_limb_t q1, q2, tmp, dqh, dql; q1 = nh << (2*(W_TYPE_SIZE - SP_NUMB_BITS)) | nl >> (2*SP_NUMB_BITS - W_TYPE_SIZE); umul_ppmm (q2, tmp, q1, di); umul_ppmm (dqh, dql, q2, d); tmp = nl; nl = tmp - dql; nh = nh - dqh - (nl > tmp); if (nh) nl -= d; nl = sp_sub(nl, d, d); return sp_sub(nl, d, d); } #endif /* x*y mod m */ static inline sp_t sp_mul (sp_t x, sp_t y, sp_t m, sp_t d) { sp_t u, v; umul_ppmm (u, v, x, y); return sp_udiv_rem (u, v, m, d); } /* x*y mod m */ static inline sp_t sp_sqr (sp_t x, sp_t m, sp_t d) { sp_t u, v; umul_ppmm (u, v, x, x); return sp_udiv_rem (u, v, m, d); } #define sp_neg(x,m) ((x) == (sp_t) 0 ? (sp_t) 0 : (m) - (x)) /* Returns x^a % m, uses a right-to-left powering ladder */ static inline sp_t sp_pow (sp_t x, sp_t a, sp_t m, sp_t d) { sp_t partial = 1; while (1) { if (a & 1) partial = sp_mul (x, partial, m, d); a >>= 1; if (!a) return partial; x = sp_sqr (x, m, d); } } /* 1/x mod p where d is p->mul_c */ #define sp_inv(x,p,d) sp_pow (x, (p) - 2, p, d) /* x / 2 mod m */ #define sp_div_2(x,m) (((x) & 1) ? 
(m) - (((m) - (x)) >> 1) : ((x) >> 1)) int sp_spp (sp_t, sp_t, sp_t); int sp_prime (sp_t); /* spm */ spm_t spm_init (spv_size_t, sp_t, mp_size_t); void spm_clear (spm_t); /* spv */ /* ASSIGNMENT */ void spv_set (spv_t, spv_t, spv_size_t); void spv_rev (spv_t, spv_t, spv_size_t); void spv_set_sp (spv_t, sp_t, spv_size_t); void spv_set_zero (spv_t, spv_size_t); /* ARITHMETIC */ /* add */ void spv_add (spv_t, spv_t, spv_t, spv_size_t, sp_t); void spv_add_sp (spv_t, spv_t, sp_t, spv_size_t, sp_t); /* subtract */ void spv_sub (spv_t, spv_t, spv_t, spv_size_t, sp_t); void spv_sub_sp (spv_t, spv_t, sp_t, spv_size_t, sp_t); void spv_neg (spv_t, spv_t, spv_size_t, sp_t); /* pointwise multiplication */ void spv_pwmul (spv_t, spv_t, spv_t, spv_size_t, sp_t, sp_t); void spv_pwmul_rev (spv_t, spv_t, spv_t, spv_size_t, sp_t, sp_t); void spv_mul_sp (spv_t, spv_t, sp_t, spv_size_t, sp_t, sp_t); void spv_random (spv_t, spv_size_t, sp_t); int spv_cmp (spv_t, spv_t, spv_size_t); /* ntt_gfp */ void spv_ntt_gfp_dif (spv_t, spv_size_t, spm_t); void spv_ntt_gfp_dit (spv_t, spv_size_t, spm_t); /* mpzspm */ spv_size_t mpzspm_max_len (mpz_t); mpzspm_t mpzspm_init (spv_size_t, mpz_t); void mpzspm_clear (mpzspm_t); /* mpzspv */ mpzspv_t mpzspv_init (spv_size_t, mpzspm_t); void mpzspv_clear (mpzspv_t, mpzspm_t); int mpzspv_verify (mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_set (mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_revcopy (mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_set_sp (mpzspv_t, spv_size_t, sp_t, spv_size_t, mpzspm_t); void mpzspv_from_mpzv (mpzspv_t, const spv_size_t, const mpzv_t, const spv_size_t, mpzspm_t); void mpzspv_reverse (mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_neg (mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_add (mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_to_mpzv (mpzspv_t, 
spv_size_t, mpzv_t, spv_size_t, mpzspm_t); void mpzspv_normalise (mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_pwmul (mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_to_ntt (mpzspv_t, spv_size_t, spv_size_t, spv_size_t, int, mpzspm_t); void mpzspv_from_ntt (mpzspv_t, spv_size_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_mul_ntt (mpzspv_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, mpzspv_t, spv_size_t, spv_size_t, spv_size_t, int, spv_size_t, mpzspm_t, int); void mpzspv_random (mpzspv_t, spv_size_t, spv_size_t, mpzspm_t); void mpzspv_to_dct1 (mpzspv_t, mpzspv_t, spv_size_t, spv_size_t, mpzspv_t, mpzspm_t); void mpzspv_mul_by_dct (mpzspv_t, const mpzspv_t, spv_size_t, const mpzspm_t, int); void mpzspv_sqr_reciprocal (mpzspv_t, spv_size_t, const mpzspm_t); #endif /* _SP_H */ ecm-6.4.4/ecm-params.h.athlon0000644023561000001540000000113312106741273012676 00000000000000/* this is the parameter file for Opteron */ #define MPZMOD_THRESHOLD 170 #define REDC_THRESHOLD 294 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 #define SPV_NTT_GFP_DIF_RECURSIVE_THRESHOLD 32768 #define SPV_NTT_GFP_DIT_RECURSIVE_THRESHOLD 32768 #define MUL_NTT_THRESHOLD 1024 #define PREREVERTDIVISION_NTT_THRESHOLD 64 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 512 ecm-6.4.4/listz.c0000644023561000001540000006347212106741273010543 00000000000000/* Arithmetic on lists of residues modulo n. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2012 Paul Zimmermann and Alexander Kruppa. This file is part of the ECM Library. 
The ECM Library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The ECM Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.

You should have received a copy of the GNU Lesser General Public License
along with the ECM Library; see the file COPYING.LIB. If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

/* NOTE(review): the operand of the first #include below is missing in this
   copy of the file (an angle-bracket header name appears to have been lost);
   restore it from the upstream listz.c before compiling. */
#include
#include "ecm-impl.h"

/* ASSERTD(x) checks x with assert() in DEBUG builds and expands to nothing
   otherwise. */
#ifdef DEBUG
#define ASSERTD(x) assert(x)
#else
#define ASSERTD(x)
#endif

/* LIST_MULT_N names the full polynomial product routine selected by MULT.
   It is called below as LIST_MULT_N (a, b, c, K, t). */
#if (MULT == KS)
#define LIST_MULT_N kronecker_schonhage
#define WRAP /* use wrap-around multiplication for low short product */
#elif (MULT == TOOM4)
#define LIST_MULT_N toomcook4
#elif (MULT == TOOM3)
#define LIST_MULT_N toomcook3
#elif (MULT == KARA)
#define LIST_MULT_N karatsuba
#else
#error "MULT is neither KS, TOOM4, nor TOOM3, nor KARA"
#endif

extern unsigned int Fermat;

/* returns a bound on the auxiliary memory needed by LIST_MULT_N

   len is the number of coefficients of each operand; the result is a number
   of list cells.
   NOTE(review): the extra term is guarded by TOOMCOOK3/TOOMCOOK4, whereas
   the routine selection above tests MULT == TOOM3 / MULT == TOOM4.  Neither
   TOOMCOOK macro is defined in the visible part of this file, so as far as
   can be seen here the function returns 2 * len -- confirm that this is
   intended for the Toom-Cook configurations. */
int
list_mul_mem (unsigned int len)
{
  unsigned int mem;

  mem = 2 * len;
#if defined(TOOMCOOK3) || defined(TOOMCOOK4)
  /* add 2 cells per level of the divide-by-3 recursion, plus 4 */
  while (len > 3)
    {
      mem += 2;
      len = (len + 2) / 3; /* ceil(len/3) */
    }
  mem += 4;
#endif
  return mem;
}

/* creates a list of n integers, return NULL if error

   The array comes from malloc and every entry is initialised with
   mpz_init. */
listz_t
init_list (unsigned int n)
{
  listz_t p;
  unsigned int i;

  p = (mpz_t*) malloc (n * sizeof (mpz_t));
  if (p == NULL)
    return NULL;
  for (i = 0; i < n; i++)
    mpz_init (p[i]);
  return p;
}

/* creates a list of n integers, return NULL if error.
Allocates each mpz_t to the size of N bits */ listz_t init_list2 (unsigned int n, unsigned int N) { listz_t p; unsigned int i; p = (mpz_t*) malloc (n * sizeof (mpz_t)); if (p == NULL) return NULL; for (i = 0; i < n; i++) mpz_init2 (p[i], N); return p; } /* clears a list of n integers */ void clear_list (listz_t p, unsigned int n) { unsigned int i; if (p == NULL) return; for (i = 0; i < n; i++) mpz_clear (p[i]); free (p); } #ifdef DEBUG /* prints a list of n coefficients as a polynomial */ void print_list (listz_t p, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) { if (i > 0 && mpz_cmp_ui (p[i], 0) >= 0) fprintf (ECM_STDOUT, "+"); mpz_out_str (ECM_STDOUT, 10, p[i]); fprintf (ECM_STDOUT, "*x^%u", i); } fprintf (ECM_STDOUT, "\n"); } static int list_check (listz_t a, unsigned int l, mpz_t n) { unsigned int i; for (i = 0; i < l; i++) if (mpz_cmp_ui (a[i], 0) < 0 || mpz_cmp (n, a[i]) <= 0) { fprintf (ECM_STDOUT, "l=%u i=%u\n", l, i); mpz_out_str (ECM_STDOUT, 10, a[i]); fprintf (ECM_STDOUT, "\n"); return 0; } return 1; } #endif /* DEBUG */ /* Read all entries in list from stream. Return 0 on success, ECM_ERROR on error */ int list_inp_raw (listz_t a, FILE *f, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) if (mpz_inp_raw (a[i], f) == 0) return ECM_ERROR; return 0; } /* Write all entries in list to stream. Return 0 on success, ECM_ERROR on error */ int list_out_raw (FILE *f, listz_t a, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) if (mpz_out_raw (f, a[i]) == 0) return ECM_ERROR; return 0; } /* p <- q */ void list_set (listz_t p, listz_t q, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) mpz_set (p[i], q[i]); } /* p[0] <-> p[n-1], p[1] <-> p[n-2], ... 
*/ void list_revert (listz_t p, unsigned int n) { unsigned int i; for (i = 0; i < n - 1 - i; i++) mpz_swap (p[i], p[n - 1 - i]); } void list_swap (listz_t p, listz_t q, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) mpz_swap (p[i], q[i]); } /* p <- -q, keeps residues normalized */ void list_neg (listz_t p, listz_t q, unsigned int l, mpz_t n) { unsigned int i; for (i = 0; i < l; i++) { if (mpz_sgn (q[i])) mpz_sub (p[i], n, q[i]); else mpz_set_ui (p[i], 0); } } /* p <- q modulo mod */ void list_mod (listz_t p, listz_t q, unsigned int n, mpz_t mod) { unsigned int i; for (i = 0; i < n; i++) mpz_mod (p[i], q[i], mod); } /* p <- q + r */ void list_add (listz_t p, listz_t q, listz_t r, unsigned int l) { unsigned int i; for (i = 0; i < l; i++) mpz_add (p[i], q[i], r[i]); } /* p <- q - r */ void list_sub (listz_t p, listz_t q, listz_t r, unsigned int l) { unsigned int i; for (i = 0; i < l; i++) mpz_sub (p[i], q[i], r[i]); } /* p[i] <- q[i] * r mod m */ void list_mul_z (listz_t p, listz_t q, mpz_t r, unsigned int n, mpz_t m) { unsigned int i; for (i = 0; i < n; i++) { mpz_mul (p[i], q[i], r); mpz_mod (p[i], p[i], m); } } /* p <- gcd(n, l[0]*l[1]*...*l[k-1], returns non-zero iff p is non trivial. Clobbers l[0] */ int list_gcd (mpz_t p, listz_t l, unsigned int k, mpz_t n) { unsigned int i; for (i = 1; i < k; i++) { mpz_mul (l[0], l[0], l[i]); mpz_mod (l[0], l[0], n); } mpz_gcd (p, l[0], n); return mpz_cmp_ui (p, 1); } /* Multiply up the integers in l, modulo n. Each entry becomes the product (mod n) of itself and all previous entries */ void list_mulup (listz_t l, unsigned int k, mpz_t n, mpz_t t) { unsigned int i; for (i = 1; i < k; i++) { mpz_mul (t, l[i - 1], l[i]); mpz_mod (l[i], t, n); } } /* p <- 0 */ void list_zero (listz_t p, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) mpz_set_ui (p[i], 0); } #ifndef KS_MULTIPLY /* puts in a[0]..a[K-1] the K low terms of the product of b[0..K-1] and c[0..K-1]. Assumes K >= 1, and a[0..2K-2] exist. 
Needs space for list_mul_mem(K) in t. */
/* n is only used by the DEBUG range checks (ASSERTD). */
static void
list_mul_low (listz_t a, listz_t b, listz_t c, unsigned int K, listz_t t,
              mpz_t n)
{
  unsigned int p, q;

  ASSERT(K > 0);
  switch (K)
    {
    case 1:
      mpz_mul (a[0], b[0], c[0]);
      return;
    case 2:
      mpz_mul (a[0], b[0], c[0]);
      mpz_mul (a[1], b[0], c[1]);
      mpz_addmul (a[1], b[1], c[0]);
      return;
    case 3:
      /* full 2x2 product gives a[0..2]; then add the two missing
         contributions to the coefficient of degree 2 */
      karatsuba (a, b, c, 2, t);
      mpz_addmul (a[2], b[2], c[0]);
      mpz_addmul (a[2], b[0], c[2]);
      return;
    default:
      /* MULT is 2 for Karatsuba, 3 for Toom3, 4 for Toom4 */
      for (p = 1; MULT * p <= K; p *= MULT); /* p = greatest power of MULT <=K */
      /* round K down to a multiple of that power */
      p = (K / p) * p;
      ASSERTD(list_check(b,p,n) && list_check(c,p,n));
      /* full product of the p low coefficients of b and c */
      LIST_MULT_N (a, b, c, p, t);
      if ((q = K - p))
        {
          /* add the low q terms of b[p..K-1] * c and of c[p..K-1] * b
             to a[p..K-1]; each short product is built in t first */
          list_mul_low (t, b + p, c, q, t + 2 * q - 1, n);
          list_add (a + p, a + p, t, q);
          list_mul_low (t, c + p, b, q, t + 2 * q - 1, n);
          list_add (a + p, a + p, t, q);
        }
    }
}
#endif

/* puts in a[K-1]..a[2K-2] the K high terms of the product of b[0..K-1] and
   c[0..K-1]. Assumes K >= 1, and a[0..2K-2] exist.
   Needs space for list_mul_mem(K) in t.

   With KS_MULTIPLY defined the routine simply calls LIST_MULT_N on the full
   operands. */
void
list_mul_high (listz_t a, listz_t b, listz_t c, unsigned int K, listz_t t)
{
#ifdef KS_MULTIPLY /* ks is faster */
  LIST_MULT_N (a, b, c, K, t);
#else
  unsigned int p, q;

  ASSERT(K > 0);
  switch (K)
    {
    case 1:
      mpz_mul (a[0], b[0], c[0]);
      return;
    case 2:
      mpz_mul (a[2], b[1], c[1]);
      mpz_mul (a[1], b[1], c[0]);
      mpz_addmul (a[1], b[0], c[1]);
      return;
    case 3:
      /* full product of the two top coefficients goes to a[2..4]; then add
         the two missing contributions to the coefficient of degree 2 */
      karatsuba (a + 2, b + 1, c + 1, 2, t);
      mpz_addmul (a[2], b[0], c[2]);
      mpz_addmul (a[2], b[2], c[0]);
      return;
    default:
      /* MULT is 2 for Karatsuba, 3 for Toom3, 4 for Toom4 */
      for (p = 1; MULT * p <= K; p *= MULT);
      /* p = largest multiple of that power of MULT not exceeding K */
      p = (K / p) * p;
      q = K - p;
      /* full product of the p high coefficients, stored from a[2q] on */
      LIST_MULT_N (a + 2 * q, b + q, c + q, p, t);
      if (q)
        {
          /* add the high q terms of b[p..K-1] * c[0..q-1] and of
             c[p..K-1] * b[0..q-1] to a[K-1..K+q-2]; the recursive call
             leaves them in t[q-1..2q-2] */
          list_mul_high (t, b + p, c, q, t + 2 * q - 1);
          list_add (a + K - 1, a + K - 1, t + q - 1, q);
          list_mul_high (t, c + p, b, q, t + 2 * q - 1);
          list_add (a + K - 1, a + K - 1, t + q - 1, q);
        }
    }
#endif
}

/* Puts in a[0..2K-2] the product of b[0..K-1] and c[0..K-1].
The auxiliary memory M(K) necessary in T satisfies:
   M(1)=0, M(K) = max(3*l-1,2*l-2+M(l)) <= 2*K-1 where l = ceil(K/2).
   Assumes K >= 1.

   No reduction modulo n is performed here; coefficients are combined with
   plain mpz_add/mpz_sub/mpz_mul. */
void
karatsuba (listz_t a, listz_t b, listz_t c, unsigned int K, listz_t t)
{
  if (K == 1)
    {
      mpz_mul (a[0], b[0], c[0]);
    }
  else if (K == 2) /* basic Karatsuba scheme */
    {
      mpz_add (t[0], b[0], b[1]); /* t0 = b_0 + b_1 */
      mpz_add (a[1], c[0], c[1]); /* a1 = c_0 + c_1 */
      mpz_mul (a[1], a[1], t[0]); /* a1 = b_0*c_0 + b_0*c_1 + b_1*c_0 + b_1*c_1 */
      mpz_mul (a[0], b[0], c[0]); /* a0 = b_0 * c_0 */
      mpz_mul (a[2], b[1], c[1]); /* a2 = b_1 * c_1 */
      mpz_sub (a[1], a[1], a[0]); /* a1 = b_0*c_1 + b_1*c_0 + b_1*c_1 */
      mpz_sub (a[1], a[1], a[2]); /* a1 = b_0*c_1 + b_1*c_0 */
    }
  else if (K == 3)
    {
      /* implement Weimerskirch/Paar trick in 6 muls and 13 adds
         http://www.crypto.ruhr-uni-bochum.de/Publikationen/texte/kaweb.pdf */
      /* diagonal terms */
      mpz_mul (a[0], b[0], c[0]);
      mpz_mul (a[2], b[1], c[1]);
      mpz_mul (a[4], b[2], c[2]);
      /* (0,1) rectangular term */
      mpz_add (t[0], b[0], b[1]);
      mpz_add (t[1], c[0], c[1]);
      mpz_mul (a[1], t[0], t[1]);
      mpz_sub (a[1], a[1], a[0]);
      mpz_sub (a[1], a[1], a[2]);
      /* (1,2) rectangular term */
      mpz_add (t[0], b[1], b[2]);
      mpz_add (t[1], c[1], c[2]);
      mpz_mul (a[3], t[0], t[1]);
      mpz_sub (a[3], a[3], a[2]);
      mpz_sub (a[3], a[3], a[4]);
      /* (0,2) rectangular term */
      mpz_add (t[0], b[0], b[2]);
      mpz_add (t[1], c[0], c[2]);
      mpz_mul (t[2], t[0], t[1]);
      mpz_sub (t[2], t[2], a[0]);
      mpz_sub (t[2], t[2], a[4]);
      mpz_add (a[2], a[2], t[2]);
    }
  else
    {
      unsigned int i, k, l;
      listz_t z;

      k = K / 2;  /* size of the high halves */
      l = K - k;  /* size of the low halves, l = k or l = k + 1 */

      z = t + 2 * l - 1;

      /* improved code with 7*k-3 additions,
         contributed by Philip McLaughlin */
      /* z <- b_low - b_high, a[0..l-1] <- c_low - c_high */
      for (i = 0; i < k; i++)
        {
          mpz_sub (z[i], b[i], b[l+i]);
          mpz_sub (a[i], c[i], c[l+i]);
        }

      if (l > k) /* case K odd */
        {
          mpz_set (z[k], b[k]);
          mpz_set (a[k], c[k]);
        }

      /* as b[0..l-1] + b[l..K-1] is stored in t[2l-1..3l-2], we need
         here at least 3l-1 entries in t
         (the loop above actually stores the difference, not the sum) */

      karatsuba (t, z, a, l, a + l); /* fills t[0..2l-2] */

      /* trick: save t[2l-2] in a[2l-1] to enable M(K) <= 2*K-1 */
      z = t + 2 * l - 2;
      mpz_set (a[2*l-1], t[2*l-2]);

      karatsuba (a, b, c, l, z); /* fill a[0..2l-2] */
      karatsuba (a + 2 * l, b + l, c + l, k, z); /* fills a[2l..2K-2] */

      mpz_set (t[2*l-2], a[2*l-1]); /* restore t[2*l-2] */
      mpz_set_ui (a[2*l-1], 0);

      /*
               l          l-1     1      l         2k-1-l
          _________________________________________________
          |    a0    |     a1    |0|    a2    |     a3    |
          -------------------------------------------------
                l          l-1
          ________________________
          |    t0    |     t1    |
          ------------------------

         We want to replace [a1, a2] by [a1 + a0 + a2 - t0, a2 + a1 + a3 - t1]
         i.e. [a12 + a0 - t0, a12 + a3 - t1] where a12 = a1 + a2.
       */

      list_add (a + 2 * l, a + 2 * l, a + l, l-1); /* a[2l..3l-1] <- a1+a2 */
      if (k > 1)
        {
          list_add (a + l, a + 2 * l, a, l); /* a[l..2l-1] <- a0 + a1 + a2 */
          list_add (a + 2 * l, a + 2 * l, a + 3 * l, 2 * k - 1 - l);
        }
      else /* k=1, i.e. K=2 or K=3, and a2 has only one entry */
        {
          /* K = 1, 2 and 3 are handled by the branches above, so for the
             documented K >= 1 this fallback is not reached */
          mpz_add (a[l], a[2*l], a[0]);
          if (K == 3)
            mpz_set (a[l+1], a[1]);
        }

      list_sub (a + l, a + l, t, 2 * l - 1);
    }
}

/* multiplies b[0]+...+b[k-1]*x^(k-1)+x^k by c[0]+...+c[l-1]*x^(l-1)+x^l
   and puts the results in a[0]+...+a[k+l-1]*x^(k+l-1)
   [the leading monomial x^(k+l) is implicit].
   If monic_b (resp. monic_c) is 0, don't consider x^k in b (resp. x^l in c).
   Assumes k = l or k = l+1.
   The auxiliary array t contains at least list_mul_mem(l) entries.
   a and t should not overlap.
*/
/* The l x l core product is computed by F_mul when Fermat is non-zero and
   l is a power of two, and by LIST_MULT_N otherwise; the extra coefficient
   b[l] (when k = l+1) and the implicit leading monomials are then added by
   hand.  l must be at least 1: the power-of-two test below does not
   terminate for l == 0. */
void
list_mul (listz_t a, listz_t b, unsigned int k, int monic_b,
          listz_t c, unsigned int l, int monic_c, listz_t t)
{
  unsigned int i, po2;

  ASSERT(k == l || k == l + 1);

  /* po2 <- 1 iff l is a power of two: strip the trailing zero bits */
  for (po2 = l; (po2 & 1) == 0; po2 >>= 1);
  po2 = (po2 == 1);

#ifdef DEBUG
  /* k and l are unsigned, hence %u (this message used %d before) */
  if (Fermat && !(po2 && l == k))
    fprintf (ECM_STDOUT, "list_mul: Fermat number, but poly lengths %u and %u\n",
             k, l);
#endif

  if (po2 && Fermat)
    {
      if (monic_b && monic_c && l == k)
        {
          /* F_mul is asked to handle both leading monomials itself, so
             they must not be added a second time below */
          F_mul (a, b, c, l, MONIC, Fermat, t);
          monic_b = monic_c = 0;
        }
      else
        F_mul (a, b, c, l, DEFAULT, Fermat, t);
    }
  else
    LIST_MULT_N (a, b, c, l, t); /* set a[0]...a[2l-2] */

  if (k > l) /* multiply b[l]*x^l by c[0]+...+c[l-1]*x^(l-1) */
    {
      for (i = 0; i < l - 1; i++)
        mpz_addmul (a[l+i], b[l], c[i]);
      /* a[2l-1] was not written by the core product: assign, don't add */
      mpz_mul (a[2*l-1], b[l], c[l-1]);
    }

  /* deal with x^k and x^l */
  if (monic_b || monic_c)
    {
      mpz_set_ui (a[k + l - 1], 0);

      if (monic_b && monic_c) /* Single pass over a[] */
        {
          /* a += b * x^l + c * x^k, so a[i] += b[i-l]; a[i] += c[i-k]
             if 0 <= i-l < k  or  0 <= i-k < l, respectively */
          if (k > l)
            mpz_add (a[l], a[l], b[0]);
          for (i = k; i < k + l; i++)
            {
              mpz_add (a[i], a[i], b[i-l]); /* i-l < k */
              mpz_add (a[i], a[i], c[i-k]); /* i-k < l */
            }
        }
      else if (monic_c) /* add b * x^l */
        list_add (a + l, a + l, b, k);
      else /* only monic_b, add x^k * c */
        list_add (a + k, a + k, c, l);
    }
}

/* Multiplies b[0..k-1] by c[0..k-1], stores the result in a[0..2k-2],
   and stores the reduced product in a2[0..2k-2].
   (Here, there is no implicit monic leading monomial.)
   Requires at least list_mul_mem(k) cells in t.
*/ void list_mulmod (listz_t a2, listz_t a, listz_t b, listz_t c, unsigned int k, listz_t t, mpz_t n) { int i; for (i = k; (i & 1) == 0; i >>= 1); ASSERTD(list_check(b,k,n)); ASSERTD(list_check(c,k,n)); if (i == 1 && Fermat) F_mul (a, b, c, k, DEFAULT, Fermat, t); else LIST_MULT_N (a, b, c, k, t); /* set a[0]...a[2l-2] */ list_mod (a2, a, 2 * k - 1, n); } /* puts in G[0]..G[k-1] the coefficients from (x+a[0])...(x+a[k-1]) Warning: doesn't fill the coefficient 1 of G[k], which is implicit. Needs k + list_mul_mem(k/2) cells in T. G == a is allowed. T must not overlap with anything else. */ void PolyFromRoots (listz_t G, listz_t a, unsigned int k, listz_t T, mpz_t n) { unsigned int l, m; ASSERT (T != G && T != a); ASSERT (k >= 1); if (k == 1) { /* we consider x + a[0], which mean we consider negated roots */ mpz_mod (G[0], a[0], n); return; } m = k / 2; /* m >= 1 */ l = k - m; /* l >= 1 */ PolyFromRoots (G, a, l, T, n); PolyFromRoots (G + l, a + l, m, T, n); list_mul (T, G, l, 1, G + l, m, 1, T + k); list_mod (G, T, k, n); } /* puts in G[0]..G[k-1] the coefficients from (x+a[0])...(x+a[k-1]) Warning: doesn't fill the coefficient 1 of G[k], which is implicit. Needs k + list_mul_mem(k/2) cells in T. The product tree is stored in: G[0..k-1] (degree k) Tree[0][0..k-1] (degree k/2) Tree[1][0..k-1] (degree k/4), ..., Tree[lgk-1][0..k-1] (degree 1) (then we should have initially Tree[lgk-1] = a). The parameter dolvl signals that only level 'dolvl' of the tree should be computed (dolvl < 0 means all levels). Either Tree <> NULL and TreeFile == NULL, and we write the tree to memory, or Tree == NULL and TreeFile <> NULL, and we write the tree to disk. 
*/ int PolyFromRoots_Tree (listz_t G, listz_t a, unsigned int k, listz_t T, int dolvl, mpz_t n, listz_t *Tree, FILE *TreeFile, unsigned int sh) { unsigned int l, m; listz_t H1, *NextTree; ASSERT (k >= 1); if (k == 1) { /* we consider x + a[0], which mean we consider negated roots */ mpz_mod (G[0], a[0], n); return 0; } if (Tree == NULL) /* -treefile case */ { H1 = G; NextTree = NULL; } else { H1 = Tree[0] + sh; NextTree = Tree + 1; } m = k / 2; l = k - m; if (dolvl != 0) /* either dolvl < 0 and we need to compute all levels, or dolvl > 0 and we need first to compute lower levels */ { PolyFromRoots_Tree (H1, a, l, T, dolvl - 1, n, NextTree, TreeFile, sh); PolyFromRoots_Tree (H1 + l, a + l, m, T, dolvl - 1, n, NextTree, TreeFile, sh + l); } if (dolvl <= 0) { /* Write this level to disk, if requested */ if (TreeFile != NULL) { if (list_out_raw (TreeFile, H1, l) == ECM_ERROR || list_out_raw (TreeFile, H1 + l, m) == ECM_ERROR) { outputf (OUTPUT_ERROR, "Error writing product tree of F\n"); return ECM_ERROR; } } list_mul (T, H1, l, 1, H1 + l, m, 1, T + k); list_mod (G, T, k, n); } return 0; } /* puts in q[0..K-1] the quotient of x^(2K-2) by B where B = b[0]+b[1]*x+...+b[K-1]*x^(K-1) with b[K-1]=1. */ void PolyInvert (listz_t q, listz_t b, unsigned int K, listz_t t, mpz_t n) { if (K == 1) { mpz_set_ui (q[0], 1); return; } else { int k, l, po2, use_middle_product = 0; #ifdef KS_MULTIPLY use_middle_product = 1; #endif k = K / 2; l = K - k; for (po2 = K; (po2 & 1) == 0; po2 >>= 1); po2 = (po2 == 1 && Fermat != 0); /* first determine l most-significant coeffs of Q */ PolyInvert (q + k, b + k, l, t, n); /* Q1 = {q+k, l} */ /* now Q1 * B = x^(2K-2) + O(x^(2K-2-l)) = x^(2K-2) + O(x^(K+k-2)). 
We need the coefficients of degree K-1 to K+k-2 of Q1*B */ ASSERTD(list_check(q+k,l,n) && list_check(b,l,n)); if (po2 == 0 && use_middle_product) { TMulKS (t, k - 1, q + k, l - 1, b, K - 1, n, 0); list_neg (t, t, k, n); } else if (po2) { list_revert (q + k, l); /* This expects the leading monomials explicitly in q[2k-1] and b[k+l-1] */ F_mul_trans (t, q + k, b, K / 2, K, Fermat, t + k); list_revert (q + k, l); list_neg (t, t, k, n); } else { LIST_MULT_N (t, q + k, b, l, t + 2 * l - 1); /* t[0..2l-1] = Q1 * B0 */ list_neg (t, t + l - 1, k, n); if (k > 1) { list_mul (t + k, q + k, l - 1, 1, b + l, k - 1, 1, t + k + K - 2); /* Q1 * B1 */ list_sub (t + 1, t + 1, t + k, k - 1); } } list_mod (t, t, k, n); /* high(1-B*Q1) */ ASSERTD(list_check(t,k,n) && list_check(q+l,k,n)); if (po2) F_mul (t + k, t, q + l, k, DEFAULT, Fermat, t + 3 * k); else LIST_MULT_N (t + k, t, q + l, k, t + 3 * k - 1); list_mod (q, t + 2 * k - 1, k, n); } } /* divides a[0]+a[1]*x+...+a[2K-1]*x^(2K-1) By b[0]+b[1]*x+...+b[K-1]*x^(K-1)+x^K i.e. a polynomial of 2K coefficients divided by a monic polynomial with K+1 coefficients (b[K]=1 is implicit). Puts the quotient in q[0]+q[1]*x+...+q[K-1]*x^(K-1) and the remainder in a[0]+a[1]*x+...+a[K-1]*x^(K-1) Needs space for list_mul_mem(K) coefficients in t. If top is non-zero, a[0]..a[K-1] are reduced mod n. 
*/ void RecursiveDivision (listz_t q, listz_t a, listz_t b, unsigned int K, listz_t t, mpz_t n, int top) { if (K == 1) /* a0+a1*x = a1*(b0+x) + a0-a1*b0 */ { mpz_mod (a[1], a[1], n); mpz_mul (q[0], a[1], b[0]); mpz_mod (q[0], q[0], n); mpz_sub (a[0], a[0], q[0]); if (top) mpz_mod (a[0], a[0], n); mpz_set (q[0], a[1]); } else { unsigned int k, l, i, po2; k = K / 2; l = K - k; for (po2 = K; (po2 && 1) == 0; po2 >>= 1); po2 = (po2 == 1); /* first perform a (2l) / l division */ RecursiveDivision (q + k, a + 2 * k, b + k, l, t, n, 0); /* subtract q[k..k+l-1] * b[0..k-1] */ ASSERTD(list_check(q+l,k,n) && list_check(b,k,n)); if (po2 && Fermat) F_mul (t, q + l, b, k, DEFAULT, Fermat, t + K); /* sets t[0..2*k-2]*/ else LIST_MULT_N (t, q + l, b, k, t + K - 1); /* sets t[0..2*k-2] */ list_sub (a + l, a + l, t, 2 * k - 1); if (k < l) /* don't forget to subtract q[k] * b[0..k-1] */ { for (i=0; i= 2. Requires 2K-1 + list_mul_mem(K) cells in t. Notations: R = r[0..K-1], A = a[0..2K-2], low(A) = a[0..K-1], high(A) = a[K..2K-2], Q = t[0..K-2] Return non-zero iff an error occurred. 
*/ int PrerevertDivision (listz_t a, listz_t b, listz_t invb, unsigned int K, listz_t t, mpz_t n) { int po2, wrap; listz_t t2 = NULL; #ifdef WRAP wrap = ks_wrapmul_m (K + 1, K + 1, n) <= 2 * K - 1 + list_mul_mem (K); #else wrap = 0; #endif /* Q <- high(high(A) * INVB) with a short product */ for (po2 = K; (po2 & 1) == 0; po2 >>= 1); po2 = (po2 == 1); if (Fermat && po2) { mpz_set_ui (a[2 * K - 1], 0); if (K <= 4 * Fermat) { F_mul (t, a + K, invb, K, DEFAULT, Fermat, t + 2 * K); /* Put Q in T, as we still need high(A) later on */ list_mod (t, t + K - 2, K, n); } else { F_mul (t, a + K, invb, K, DEFAULT, Fermat, t + 2 * K); list_mod (a + K, t + K - 2, K, n); } } else /* non-Fermat case */ { list_mul_high (t, a + K, invb, K - 1, t + 2 * K - 3); /* the high part of A * INVB is now in {t+K-2, K-1} */ if (wrap) { MEMORY_TAG; t2 = init_list2 (K - 1, mpz_sizeinbase (n, 2)); MEMORY_UNTAG; if (t2 == NULL) { fprintf (ECM_STDERR, "Error, not enough memory\n"); return ECM_ERROR; } list_mod (t2, t + K - 2, K - 1, n); } else /* we can store in high(A) which is no longer needed */ list_mod (a + K, t + K - 2, K - 1, n); } /* the quotient Q = trunc(A / B) has degree K-2, i.e. K-1 terms */ /* T <- low(Q * B) with a short product */ mpz_set_ui (a[2 * K - 1], 0); if (Fermat && po2) { if (K <= 4 * Fermat) { /* Multiply without zero padding, result is (mod x^K - 1) */ F_mul (t + K, t, b, K, NOPAD, Fermat, t + 2 * K); /* Take the leading monomial x^K of B into account */ list_add (t, t + K, t, K); /* Subtract high(A) */ list_sub(t, t, a + K, K); } else F_mul (t, a + K, b, K, DEFAULT, Fermat, t + 2 * K); } else /* non-Fermat case */ { #ifdef KS_MULTIPLY /* ks is faster */ if (wrap) /* Q = {t2, K-1}, B = {b, K+1} We know that Q*B vanishes with the coefficients of degree K to 2K-2 of {A, 2K-1} */ { unsigned int m; m = ks_wrapmul (t, K + 1, b, K + 1, t2, K - 1, n); clear_list (t2, K - 1); /* coefficients of degree m..2K-2 wrap around, i.e. 
were subtracted to 0..2K-2-m */ if (m < 2 * K - 1) /* otherwise product is exact */ list_add (t, t, a + m, 2 * K - 1 - m); } else LIST_MULT_N (t, a + K, b, K, t + 2 * K - 1); #else list_mul_low (t, a + K, b, K, t + 2 * K - 1, n); #endif } /* now {t, K} contains the low K terms from Q*B */ list_sub (a, a, t, K); list_mod (a, a, K, n); return 0; } /* Puts in inv[0..l-1] the inverses of a[0..l-1] (mod n), using 3*(l-1) multiplies and one gcdext. Returns 1 if a factor was found (stored in t), 0 otherwise. */ int list_invert (listz_t inv, listz_t a, unsigned long l, mpz_t t, mpmod_t modulus) { unsigned long i; if (l == 0) return 0; mpz_set (inv[0], a[0]); for (i = 1; i < l; i++) { mpz_mul (t, inv[i-1], a[i]); mpz_mod (inv[i], t, modulus->orig_modulus); /* inv[i] = a[0]*...*a[i] */ } mpz_gcdext (t, inv[l-1], NULL, inv[l-1], modulus->orig_modulus); if (mpz_cmp_ui (t, 1) != 0) return 1; for (i = l-1; i > 0; i--) { mpz_mul (t, inv[i], inv[i-1]); /* t = (a[0]*...*a[i])^(-1) * (a[0]*...*a[i-1]) = a[i]^(-1) */ mpz_mul (inv[i-1], inv[i], a[i]); /* inv[i-1] = (a[0]*...*a[i])^(-1) * a[i] = (a[0]*...*a[i-1])^(-1) */ mpz_mod (inv[i-1], inv[i-1], modulus->orig_modulus); mpz_mod (inv[i], t, modulus->orig_modulus); } return 0; } ecm-6.4.4/ecm-params.h.armv5tel0000644023561000001540000000116512106741274013156 00000000000000/* those parameters were obtained on gcc50.fsffrance.org with ecm-6.3-rc3, gmp-5.0.1, and gcc 4.3.2 -O2 -pedantic -fomit-frame-pointer (armv5tel-unknown-linux-gnueabi) */ #define MPZMOD_THRESHOLD 140 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 16 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 512 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 128 
ecm-6.4.4/build.vc10/0000755023561000001540000000000012113421641011135 500000000000000ecm-6.4.4/build.vc10/mul_fft-params.h.x64.amd0000644023561000001540000001121212106741270015325 00000000000000#define MUL_FFT_MODF_THRESHOLD 300 #define SQR_FFT_MODF_THRESHOLD 568 #define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {401, 5 /*96*/}, {417, 4 /*98*/}, {433, 5 /*96*/}, {865, 6 /*96*/}, {897, 5 /*98*/}, {929, 6 /*96*/}, {2113, 7 /*97*/}, {2177, 6 /*98*/}, {2241, 7 /*97*/}, {2305, 6 /*98*/}, {2369, 7 /*97*/}, {3713, 8 /*93*/}, {3841, 7 /*98*/}, {4225, 8 /*94*/}, {4353, 7 /*98*/}, {4481, 8 /*94*/}, {4865, 7 /*98*/}, {4993, 8 /*95*/}, {6913, 9 /*87*/}, {7169, 8 /*96*/}, {7425, 9 /*93*/}, {7681, 8 /*96*/}, {8449, 9 /*94*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*90*/}, {11777, 8 /*97*/}, {12033, 9 /*92*/}, {13825, 10 /*87*/}, {14337, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {20993, 10 /*87*/}, {21505, 9 /*97*/}, {22017, 10 /*91*/}, {23553, 9 /*97*/}, {26113, 10 /*92*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {44033, 11 /*91*/}, {47105, 10 /*97*/}, {56321, 11 /*87*/}, {63489, 10 /*98*/}, {70657, 11 /*87*/}, {71681, 10 /*98*/}, {72705, 11 /*90*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {81921, 10 /*96*/}, {82945, 11 /*85*/}, {96257, 10 /*98*/}, {97281, 12 /*75*/}, {98305, 10 /*97*/}, {101377, 12 /*78*/}, {102401, 11 /*91*/}, {110593, 12 /*87*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 11 /*98*/}, {194561, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 11 /*99*/}, {276481, 12 /*85*/}, {282625, 11 /*96*/}, {284673, 12 /*87*/}, {389121, 11 /*99*/}, {391169, 13 /*75*/}, {434177, 12 /*95*/}, {438273, 13 /*84*/}, {516097, 12 /*99*/}, {585729, 11 /*99*/}, {620545, 13 /*79*/}, {630785, 12 /*96*/}, {651265, 13 /*83*/}, {778241, 12 /*99*/}, {782337, 11 /*99*/}, {817153, 12 /*96*/}, {819201, 14 /*79*/}, {1032193, 13 /*99*/}, {1040385, 11 /*99*/}, {1046529, 12 /*94*/}, {LONG_MAX, 0}} #define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, 
{369, 5 /*96*/}, {385, 4 /*98*/}, {401, 5 /*96*/}, {801, 6 /*96*/}, {833, 5 /*98*/}, {865, 6 /*96*/}, {1729, 7 /*96*/}, {1793, 6 /*98*/}, {1857, 7 /*96*/}, {2049, 6 /*98*/}, {2113, 7 /*97*/}, {3841, 8 /*96*/}, {4097, 7 /*98*/}, {4225, 8 /*97*/}, {4609, 7 /*98*/}, {4737, 8 /*97*/}, {7169, 9 /*93*/}, {7681, 8 /*98*/}, {8449, 9 /*94*/}, {8705, 8 /*98*/}, {8961, 9 /*94*/}, {9217, 8 /*98*/}, {9473, 9 /*95*/}, {14849, 10 /*93*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {20481, 10 /*95*/}, {21505, 9 /*97*/}, {22017, 10 /*91*/}, {23553, 9 /*97*/}, {24065, 10 /*92*/}, {29697, 11 /*93*/}, {30721, 10 /*96*/}, {37889, 11 /*95*/}, {38913, 10 /*97*/}, {44033, 11 /*91*/}, {47105, 10 /*97*/}, {52225, 11 /*92*/}, {55297, 10 /*98*/}, {56321, 11 /*87*/}, {63489, 10 /*98*/}, {64513, 11 /*88*/}, {79873, 12 /*83*/}, {81921, 11 /*93*/}, {88065, 12 /*91*/}, {94209, 11 /*97*/}, {104449, 12 /*81*/}, {110593, 11 /*98*/}, {112641, 12 /*87*/}, {126977, 11 /*98*/}, {137217, 12 /*85*/}, {159745, 11 /*98*/}, {161793, 12 /*83*/}, {167937, 11 /*98*/}, {169985, 12 /*87*/}, {192513, 11 /*98*/}, {194561, 12 /*85*/}, {196609, 11 /*97*/}, {202753, 12 /*89*/}, {217089, 13 /*84*/}, {221185, 12 /*98*/}, {225281, 13 /*87*/}, {253953, 12 /*98*/}, {323585, 13 /*83*/}, {385025, 12 /*98*/}, {389121, 14 /*75*/}, {393217, 12 /*93*/}, {405505, 14 /*78*/}, {507905, 13 /*98*/}, {516097, 12 /*99*/}, {552961, 13 /*85*/}, {573441, 12 /*97*/}, {577537, 13 /*88*/}, {778241, 12 /*99*/}, {782337, 13 /*85*/}, {851969, 14 /*82*/}, {868353, 13 /*95*/}, {909313, 14 /*87*/}, {1032193, 13 /*99*/}, {LONG_MAX, 0}} #define MUL_FFT_FULL_TABLE2 {{16, 1}, {4224, 2}, {4416, 6}, {4480, 2}, {4608, 4}, {4640, 2}, {4800, 1}, {5120, 2}, {5184, 1}, {5632, 2}, {5760, 1}, {6656, 4}, {6720, 1}, {7168, 4}, {7360, 1}, {7936, 4}, {8000, 2}, {8064, 1}, {8704, 2}, {8832, 6}, {8960, 3}, {9216, 1}, {13312, 6}, {14336, 3}, {15360, 5}, {16896, 6}, {17920, 1}, {19968, 2}, {20736, 1}, {21504, 2}, {23808, 1}, {28672, 4}, {29440, 2}, 
{29952, 1}, {33792, 2}, {35328, 1}, {36864, 4}, {37120, 1}, {49152, 4}, {49920, 1}, {50176, 3}, {53248, 1}, {55296, 2}, {59904, 3}, {61440, 1}, {65536, 2}, {70656, 6}, {71680, 2}, {72192, 5}, {73728, 4}, {79360, 1}, {81920, 2}, {82944, 1}, {86016, 2}, {89088, 1}, {90112, 2}, {95232, 1}, {100352, 5}, {110592, 1}, {114688, 4}, {117760, 1}, {131072, 2}, {144384, 5}, {147456, 4}, {158720, 1}, {161792, 3}, {163840, 2}, {190464, 1}, {196608, 4}, {199680, 3}, {212992, 1}, {262144, 6}, {272384, 7}, {294912, 6}, {301056, 4}, {322560, 1}, {327680, 3}, {344064, 2}, {380928, 1}, {385024, 2}, {387072, 1}, {393216, 7}, {425984, 6}, {444416, 5}, {466944, 1}, {520192, 2}, {577536, 7}, {589824, 6}, {602112, 4}, {645120, 3}, {688128, 2}, {774144, 1}, {786432, 6}, {788480, 4}, {808960, 5}, {811008, 2}, {817152, 3}, {819200, 5}, {823296, 2}, {829440, 1}, {1048576, 2}, {1069056, 1}, {1073152, 5}, {1081344, 3}, {1089536, 2}, {LONG_MAX, 1}} ecm-6.4.4/build.vc10/ecm-params.h0000644023561000001540000000072412106741271013264 00000000000000#define MPZMOD_THRESHOLD 170 #define REDC_THRESHOLD 294 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 #define MUL_NTT_THRESHOLD 1024 #define PREREVERTDIVISION_NTT_THRESHOLD 64 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 512 ecm-6.4.4/build.vc10/ecm-params.h.x64.amd0000644023561000001540000000122212106741271014436 00000000000000/* updated 03 Jan 2012 on frite.loria.fr (AMD Phenom(tm) II X2 B55 Processor) for ecm-6.4 with GMP 5.0.2 */ #define TUNE_MULREDC_THRESH 10 #define TUNE_SQRREDC_THRESH 1 #define MPZMOD_THRESHOLD 103 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 8, 10, 11, 11, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 16, 18, 18, 18, 20} #define 
NTT_GFP_TWIDDLE_DIF_BREAKOVER 12 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 128 #define MPZSPV_NORMALISE_STRIDE 512 ecm-6.4.4/build.vc10/tune/0000755023561000001540000000000012113421641012110 500000000000000ecm-6.4.4/build.vc10/tune/tune.vcxproj.filters0000644023561000001540000000724312106741270016102 00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hpp;hxx;hm;inl;inc;xsd {38f1a18f-40fc-4eed-a68e-e79b58327b6c} Source Files\Assembler Source Files\Assembler Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files ecm-6.4.4/build.vc10/tune/Makefile.am0000644023561000001540000000005712106741270014073 00000000000000EXTRA_DIST = tune.vcxproj tune.vcxproj.filters ecm-6.4.4/build.vc10/tune/tune.vcxproj0000644023561000001540000001735712106741270014442 00000000000000 Release Win32 Release x64 {80E08750-5C6C-492E-BB1E-7200978AE125} tune Win32Proj Application Unicode true Application NotSet <_ProjectFileVersion>10.0.30128.1 $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ false $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ false MaxSpeed true ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) MultiThreaded true Level3 ProgramDatabase ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);%(AdditionalDependencies) true Console true true MachineX86 X64 MaxSpeed true 
..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;_WIN64;NDEBUG;_CONSOLE;TUNE;%(PreprocessorDefinitions) MultiThreaded true Level3 ProgramDatabase ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);%(AdditionalDependencies) true Console true true MachineX64 _WIN64 TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) TUNE_MULREDC_THRESH#0;TUNE_SQRREDC_THRESH#0;%(PreprocessorDefinitions) {cd555681-d65b-4173-a29c-b8bf06a4871b} ecm-6.4.4/build.vc10/assembler/0000755023561000001540000000000012113421641013112 500000000000000ecm-6.4.4/build.vc10/assembler/test_mulredc.c0000644023561000001540000001441612106741270015703 00000000000000
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <gmp.h>
#include "asmredc.h"

/* Largest operand size, in limbs, for which a mulredcN routine exists. */
#define MULREDC_MAX_LIMBS 20

/* Print {x, N} as a nested expression x[0] + W*(x[1] + W*(...)).
   Limbs are printed through unsigned long long so that the format
   matches on LLP64 targets, where unsigned long is narrower than a limb.
   Prints an empty line when N <= 0. */
void mp_print(mp_limb_t *x, int N)
{
  int i;

  if (N <= 0)
    {
      printf("\n");
      return;
    }
  for (i = 0; i < N-1; ++i)
    printf("%llu + W*(", (unsigned long long) x[i]);
  printf("%llu", (unsigned long long) x[N-1]);
  for (i = 0; i < N-1; ++i)
    printf(")");
  printf("\n");
}

/* Dispatch to the fixed-size mulredcN routine for N limbs and return its
   carry.  Note that mulredc1 takes x, y and m by value, not by pointer.
   Sizes outside 1..MULREDC_MAX_LIMBS have no routine; they are reported
   and the program exits instead of silently running mulredc20. */
static mp_limb_t
call_mulredc (int N, mp_limb_t *z, mp_limb_t *x, mp_limb_t *y, mp_limb_t *m,
              mp_limb_t invm)
{
  mp_limb_t cy = 0;

  switch (N)
    {
    case 1:
      cy = mulredc1(z, x[0], y[0], m[0], invm);
      break;
    case 2:
      cy = mulredc2(z, x, y, m, invm);
      break;
    case 3:
      cy = mulredc3(z, x, y, m, invm);
      break;
    case 4:
      cy = mulredc4(z, x, y, m, invm);
      break;
    case 5:
      cy = mulredc5(z, x, y, m, invm);
      break;
    case 6:
      cy = mulredc6(z, x, y, m, invm);
      break;
    case 7:
      cy = mulredc7(z, x, y, m, invm);
      break;
    case 8:
      cy = mulredc8(z, x, y, m, invm);
      break;
    case 9:
      cy = mulredc9(z, x, y, m, invm);
      break;
    case 10:
      cy = mulredc10(z, x, y, m, invm);
      break;
    case 11:
      cy = mulredc11(z, x, y, m, invm);
      break;
    case 12:
      cy = mulredc12(z, x, y, m, invm);
      break;
    case 13:
      cy = mulredc13(z, x, y, m, invm);
      break;
    case 14:
      cy = mulredc14(z, x, y, m, invm);
      break;
    case 15:
      cy = mulredc15(z, x, y, m, invm);
      break;
    case 16:
      cy = mulredc16(z, x, y, m, invm);
      break;
    case 17:
      cy = mulredc17(z, x, y, m, invm);
      break;
    case 18:
      cy = mulredc18(z, x, y, m, invm);
      break;
    case 19:
      cy = mulredc19(z, x, y, m, invm);
      break;
    case 20:
      cy = mulredc20(z, x, y, m, invm);
      break;
    default:
      fprintf (stderr, "call_mulredc: unsupported size %d (must be 1..%d)\n",
               N, MULREDC_MAX_LIMBS);
      exit (1);
    }
  return cy;
}

/* Run k rounds of mulredc on N-limb operands with a random odd modulus,
   checking each result against mpn_mul_n + ecm_redc3 and against a pure
   GMP division.  Rounds 0..2 use fixed patterns (all 0, all 1, all bits
   set); later rounds alternate between squaring (even i) and
   multiplication of two independent random operands (odd i).
   Exits with an error for N outside 1..MULREDC_MAX_LIMBS or when memory
   cannot be allocated; aborts through assert on a mismatch. */
void test(mp_size_t N, int k)
{
  mp_limb_t *x, *y, *yp, *z, *m, invm, cy, cy2, *tmp, *tmp2, *tmp3;
  int i, j;

  if (N < 1 || N > MULREDC_MAX_LIMBS)
    {
      fprintf (stderr, "test: unsupported size %ld (must be 1..%d)\n",
               (long) N, MULREDC_MAX_LIMBS);
      exit (1);
    }

  x = (mp_limb_t *) malloc(N*sizeof(mp_limb_t));
  y = (mp_limb_t *) malloc(N*sizeof(mp_limb_t));
  z = (mp_limb_t *) malloc((N+1)*sizeof(mp_limb_t));
  m = (mp_limb_t *) malloc(N*sizeof(mp_limb_t));
  tmp = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t));
  tmp2 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t));
  tmp3 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t));

  if (x == NULL || y == NULL || z == NULL || m == NULL || tmp == NULL ||
      tmp2 == NULL || tmp3 == NULL)
    {
      fprintf (stderr, "Cannot allocate memory in test_mulredc\n");
      exit (1);
    }

  /* random modulus, forced odd with a non-zero top limb */
  mpn_random2(m, N);
  m[0] |= 1UL;
  if (m[N-1] == 0)
    m[N-1] = 1UL;

  /* Newton iteration for 1/m[0] modulo the limb base, then negate */
  invm = 1UL;
  for (i = 0; i < 10; ++i)
    invm = (2*invm-m[0]*invm*invm);
  invm = -invm;

  assert( (invm*m[0] +1UL) == 0UL);

  for (i=0; i < k; ++i)
    {
      /* Second operand defaults to y.  It must be reset every round:
         the squaring case aliases it to x, and leaving it there would
         turn every later "multiplication" round into a squaring. */
      yp = y;

      /* Try a few special cases */
      if (i == 0)
        {
          /* Try all 0, product should be 0 */
          for (j = 0; j < N; j++)
            x[j] = y[j] = 0;
        }
      else if (i == 1)
        {
          /* Try all 1 */
          for (j = 0; j < N; j++)
            x[j] = y[j] = 1;
        }
      else if (i == 2)
        {
          /* Try all 2^wordsize - 1; built from mp_limb_t so that every
             bit of the limb is set even where unsigned long is narrower */
          for (j = 0; j < N; j++)
            x[j] = y[j] = ~((mp_limb_t) 0);
        }
      else
        {
          /* In the other cases, try random data */
          if (i % 2 == 0)
            {
              /* Try squaring */
              mpn_random2(x, N);
              yp = x;
            }
          else
            {
              /* Try multiplication */
              mpn_random2(x, N);
              mpn_random2(y, N);
            }
        }

      // Mul followed by ecm_redc3
      mpn_mul_n(tmp, x, yp, N);
      ecm_redc3(tmp, m, N, invm);
      cy2 = mpn_add_n (tmp2, tmp + N, tmp, N);

      // Mixed mul and redc
      cy = call_mulredc (N, z, x, yp, m, invm);

      if (cy != cy2)
        printf ("i = %d: mulredc cy = %ld, mpn_mul_n/ecm_redc3 cy = %ld\n",
                i, (long) cy, (long) cy2);
      assert (cy == cy2);
      if (mpn_cmp(z,tmp2, N) != 0)
        {
          printf ("i = %d\nmulredc             = ", i);
          for (j = N - 1; j >= 0; j--)
            printf ("%llx ", (unsigned long long) z[j]);
          printf ("\nmpn_mul_n/ecm_redc3 = ");
          for (j = N - 1; j >= 0; j--)
            printf ("%llx ", (unsigned long long) tmp2[j]);
          printf ("\n");
          assert (mpn_cmp(z,tmp2, N) == 0);
        }

      if (cy)
        printf("!");
      z[N] = cy;
      // Check with pure gmp : multiply by 2^(N*GMP_NUMB_BITS) and compare.
      for (j=0; j < N; ++j)
        {
          tmp[j] = 0;
          tmp[j+N] = z[j];
        }
      tmp[2*N] = z[N];
      mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N+1, m, N);
      for (j=0; j < N; ++j)
        z[j] = tmp3[j]; /* z = (mulredc result * W^N) mod m */

      mpn_mul_n(tmp, x, yp, N);
      mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N, m, N); /* tmp3 = x*yp mod m */

      assert(mpn_cmp(z, tmp3, N) == 0);
    }

  free(tmp);
  free(tmp2);
  free(tmp3);
  free(x);
  free(y);
  free(z);
  free(m);
}

/* With an argument: test that single length (1..MULREDC_MAX_LIMBS limbs)
   for 1000000 rounds, then exit.  Without: loop forever over all
   supported lengths, printing a dot after each full pass. */
int main(int argc, char** argv)
{
  int i, len;

  if (argc > 1) /* Test a specific length */
    {
      char *end;
      long val = strtol (argv[1], &end, 10);

      /* reject non-numeric input and sizes without a mulredcN routine */
      if (end == argv[1] || *end != '\0' || val < 1
          || val > MULREDC_MAX_LIMBS)
        {
          fprintf (stderr, "Usage: %s [length in limbs, 1..%d]\n",
                   argv[0], MULREDC_MAX_LIMBS);
          return 1;
        }
      len = (int) val;
      for (i = 0; i < 1; i++)
        test (len, 1000000);
      return 0;
    }

  for (;;)
    {
      for (i = 1; i <= MULREDC_MAX_LIMBS; ++i)
        {
          test(i, 1000);
        }
#if 0
      test(1, 1000);
      test(2, 1000);
      test(3, 1000);
      test(4, 1000);
      test(5, 1000);
      test(6, 1000);
      test(7, 1000);
      test(8, 1000);
      test(9, 1000);
      test(10, 1000);
      test(11, 1000);
      test(12, 1000);
      test(13, 100);
      test(14, 100);
      test(15, 100);
      test(16, 100);
      test(17, 100);
      test(18, 100);
      test(44, 10);
      test(45, 10);
      test(46, 10);
      test(47, 10);
      test(48, 10);
      test(49, 10);
#endif
      printf(".");
      fflush(stdout);
    }

#if 0
  x[0] = 12580274668139321508UL;
  x[1] = 9205793975152560417UL;
  x[2] = 7857372727033793057UL;
  y[0] = 13688385828267279103UL;
  y[1] = 10575011835742767258UL;
  y[2] = 8802048318027595690UL;

  m[0] = 2981542467342508025UL;
  m[1] = 5964669706257742025UL;
  m[2] = 18446744073678090270UL;

  invm = 9419286575570128311UL;

  carry = mulredc(z, x, y, m, 3, invm);

  printf("%lu + 2^64*(%lu + 2^64*%lu), carry=%lu\n", z[0], z[1], z[2], carry);
#endif
  return 0;
}

#if 0

W := 2^64;
x0:= 12580274668139321508;
x1:= 9205793975152560417;
x2:= 7857372727033793057;
x := x0 + W*(x1 + W*x2);

y0:= 13688385828267279103;
y1:= 10575011835742767258;
y2:= 8802048318027595690;
y := y0 + W*(y1 + W*y2);

m0:= 2981542467342508025;
m1:= 5964669706257742025;
m2:= 18446744073678090270;
m := m0 + W*(m1 + W*m2);

invm := 9419286575570128311;

#endif
ecm-6.4.4/build.vc10/assembler/a_win32a_mulredc.asm0000644023561000001540000000627712106741270016673 00000000000000 ; Part of GMP-ECM ; ; mp_limb_t mulredc1( 1 limb ; mp_limb_t *z, ; const mp_limb_t x, ; const mp_limb_t y, ; const mp_limb_t m, ; mp_limb_t inv_m ; ) ; ; mp_limb_t mulredc( > 1 limb ; mp_limb_t *z, ; const mp_limb_t *x, ; const mp_limb_t *y, ; const mp_limb_t *m, ; mp_limb_t inv_m ; ) %macro mseq_1 3 mul ebp add [edi+4*%3], %2 mov %2, 0 adc %1, eax mov eax, [esi+4*%3+8] adc %2, edx %endmacro %macro mseq_2 3 mul ebp add [edi+3*%3], %1 mov %1, 0 adc %1, eax mov eax, [esi+4*%3+8] adc %2, edx %endmacro %macro mulredc 1 %assign limbs %1 %define f_name(x) _mulredc %+ x global f_name(limbs) %ifdef DLL export f_name(limbs) %endif f_name(limbs): push ebp push edi push esi push ebx sub esp, 8*(limbs+1) mov edi, esp %assign i 0 %rep 2 * limbs + 1 mov dword [edi+4*i], 0 %assign i i + 1 %endrep mov dword [esp+8*limbs+4], limbs ; align 32 .1: mov eax, [esp+8*limbs+32] mov esi, [esp+8*limbs+36] mov eax, [eax] mul dword [esi] add eax, [edi] mul dword [esp+8*limbs+44] mov ebp, eax mov esi, [esp+8*limbs+40] mov eax, [esi] mul ebp mov ebx, eax mov ecx, edx mov eax, [esi+4] %assign i 0 %rep limbs - 2 %if (i & 1) mseq_1 ebx, ecx, i %else mseq_1 ecx, ebx, i %endif %assign i i + 1 %endrep mul ebp %if (limbs & 1) add [edi+4*limbs-8], ecx adc eax, ebx %else add [edi+4*limbs-8], ebx adc eax, ecx %endif adc edx, 0 add [edi+4*limbs-4], eax adc edx, 0 add [edi+4*limbs], edx adc dword [edi+4*limbs+4], 0 mov eax, [esp+8*limbs+32] mov ebp, [eax] mov esi, [esp+8*limbs+36] mov eax, [esi] mul ebp mov ebx, eax mov ecx, edx mov eax, [esi+4] %assign i 0 %rep limbs - 2 %if (i & 1) mseq_1 ebx, ecx, i %else mseq_1 ecx, ebx, i %endif %assign i i + 1 %endrep mul ebp %if (limbs & 1) add [edi+4*limbs-8], ecx adc eax, ebx %else add [edi+4*limbs-8], ebx adc eax, ecx %endif adc edx, 0 add [edi+4*limbs-4], eax adc edx, 0 add [edi+4*limbs],edx adc dword [edi+4*limbs+4], 0 add dword [esp+8*limbs+32], 
4 add edi, 4 dec dword [esp+8*limbs+4] jnz .1 mov ebx, [esp+8*limbs+28] %assign i 0 %rep limbs mov eax, [edi+4*i] mov [ebx+4*i], eax %assign i i + 1 %endrep mov eax, [edi+4*limbs] add esp, 8*(limbs+1) pop ebx pop esi pop edi pop ebp ret %endmacro text global _mulredc1 _mulredc1: mov eax, [esp+12] mul dword [esp+8] mov [esp+12], edx mov [esp+8], eax mul dword [esp+20] mul dword [esp+16] add eax, [esp+8] adc edx, [esp+12] mov ecx, [esp+4] mov [ecx], edx adc eax,0 ret %assign i 2 %rep 19 ; 3..20 inclusive mulredc i %assign i i + 1 %endrep end ecm-6.4.4/build.vc10/assembler/mulredc.h0000644023561000001540000000506512106741270014651 00000000000000#ifndef __ASM_REDC_H__ #define __ASM_REDC_H__ #include extern void ecm_redc3(mp_limb_t *cp, const mp_limb_t *np, mp_size_t nn, mp_limb_t Nprim); /* WARNING: the size-1 version doesn't take pointers in input */ extern mp_limb_t mulredc1(mp_limb_t *z, mp_limb_t x, mp_limb_t y, mp_limb_t m, mp_limb_t inv_m); extern mp_limb_t mulredc2(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc3(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc4(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc5(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc6(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc7(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc8(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc9(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc10(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t 
*m, mp_limb_t inv_m); extern mp_limb_t mulredc11(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc12(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc13(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc14(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc15(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc16(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc17(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc18(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc19(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); extern mp_limb_t mulredc20(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, mp_limb_t inv_m); #endif ecm-6.4.4/build.vc10/assembler/mulredc.asm0000644023561000001540000000021712106741270015174 00000000000000 %ifdef _WIN64 %include "a_x64_mulredc.asm" %elif AMD_ASM %include "a_win32a_mulredc.asm" %else %include "a_win32p_mulredc.asm" %endif ecm-6.4.4/build.vc10/assembler/a_x64_mulredc.asm0000644023561000001540000001071112106741270016175 00000000000000; ; Part of GMP-ECM ; ; mp_limb_t mulredc1( MSVC 1 limb ; mp_limb_t *z, rcx ; const mp_limb_t x, rdx ; const mp_limb_t y, r8 ; const mp_limb_t m, r9 ; mp_limb_t inv_m [rsp+0x28] ; ) ; ; mp_limb_t mulredc( MSVC > 1 limb ; mp_limb_t *z, rcx ; const mp_limb_t *x, rdx ; const mp_limb_t *y, r8 ; const mp_limb_t *m, r9 ; mp_limb_t inv_m [rsp+0x28] ; ) %macro mseq_1 4 mov %2, rcx mul r14 add %1, rax mov rax, [r9+8*%3] adc %2, rdx 
mul r11 %if %3 < %4 - 1 add rax, %1 mov [rbp+8*(%3-1)], rax mov rax, [r8+8*(%3+1)] adc %2, rdx setc cl %else add %1, rax mov [rbp+8*(%3-1)], %1 adc %2, rdx mov [rbp+8*%3], %2 setc cl mov [rbp+8*(%3+1)], rcx %endif %endmacro %macro mseq_20 2 mov r14, [r13+r12*8] mov rax, [r8] mov %1, [rbp] mov %2, [rbp+8] mul r14 add r12, 1 add rax, %1 adc %2, rdx setc cl mov %1, rax imul rax, r10 mov r11, rax mul qword [r9] add %1, rax adc %2, rdx mov rax, [r8+8] %endmacro %macro mseq_2 4 mov %2, [rbp+8*(%3+1)] adc %2, rcx %if %3 < %4 - 1 setc cl %endif mul r14 add %1, rax mov rax, [r9+8*%3] adc %2, rdx %if %3 < %4 - 1 adc cl, 0 %else setc cl %endif mul r11 %if %3 < %4 - 1 add rax, %1 mov [rbp+8*(%3-1)], rax adc %2, rdx mov rax, [r8+8*(%3+1)] %else add %1, rax mov [rbp+8*(%3-1)], %1 adc %2, rdx mov [rbp+8*%3],%2 adc cl, 0 mov [rbp+8*(%3+1)], rcx %endif %endmacro %macro store 1 %assign i 0 %rep %1 %if i == %1 - 1 && (%1 & 1) mov rax, [rbp+8*i] mov [rdi+8*i], rax %elif (i & 1) mov [rdi+8*(i-1)], rax mov [rdi+8*i], rdx %else mov rax, [rbp+8*i] mov rdx, [rbp+8*(i+1)] %endif %assign i i + 1 %endrep %endmacro %macro mulredc 1 %assign limbs %1 %define f_name(x) mulredc %+ x %define stack_space 8 * (limbs + 1 + (limbs & 1)) global f_name(limbs) %ifdef DLL export f_name(limbs) %endif align 64 PROC_FRAME f_name(limbs) ; SEH Frame push_reg rbp push_reg rbx push_reg rsi push_reg rdi push_reg r12 push_reg r13 push_reg r14 alloc_stack stack_space END_PROLOGUE ; *y in r8 mov rdi, rcx ; *z -> rdi mov r13, rdx ; *x -> r13 mov r10, [rsp+8*12+stack_space] ; invm -> r10 ; *m in r9 mov r14, [r13] mov rax, [r8] xor rcx, rcx lea rbp, [rsp] mov r12, rcx mul qword r14 add r12, 1 mov rsi, rax mov rbx, rdx imul rax, r10 mov r11, rax mul qword [r9] add rsi, rax mov rax, [r8+8] adc rbx, rdx setc cl %assign j 1 %rep limbs - 1 %if (j & 1) mseq_1 rbx, rsi, j, limbs %else mseq_1 rsi, rbx, j, limbs %endif %assign j j + 1 %endrep align 32 .1: %assign j 1 %if (limbs & 1) mseq_20 rsi, rbx %rep limbs - 1 %if (j & 1) 
mseq_2 rbx, rsi, j, limbs %else mseq_2 rsi, rbx, j, limbs %endif %assign j j + 1 %endrep %else mseq_20 rbx, rsi %rep limbs - 1 %if (j & 1) mseq_2 rsi, rbx, j, limbs %else mseq_2 rbx, rsi, j, limbs %endif %assign j j + 1 %endrep %endif cmp r12, limbs jb .1 store limbs mov rax, rcx add rsp, stack_space pop r14 pop r13 pop r12 pop rdi pop rsi pop rbx pop rbp ret ENDPROC_FRAME %endmacro bits 64 section .text global mulredc1 %ifdef DLL export mulredc1 %endif align 64 mulredc1: mov rax, r8 mul rdx mov r10, rax mov r11, rdx mul qword [rsp+0x28] mul r9 add rax, r10 adc rdx, r11 mov [rcx], rdx adc rax, 0 ret %assign i 2 %rep 19 ; 2..20 inclusive mulredc i %assign i i + 1 %endrep end ecm-6.4.4/build.vc10/assembler/a_win32p_redc.asm0000644023561000001540000000523512106741270016165 00000000000000; ; Part of GMP-ECM ; ; void ecm_redc3( ; mp_limb_t *z, rdi r8 <- rcx ; const mp_limb_t *x, rsi r9 <- rdx ; size_t n, rdx r10 <- r8 ; mp_limb_t m rcx r11 <- r9 ; ) %macro rloop 3 mov eax, [byte esi+4*%3] mul ebp add [byte edi+4*%3], %2 adc %1, eax mov %2, edx adc %2, 0 %endmacro bits 32 section .text global _ecm_redc3 %ifdef DLL export _ecm_redc3 %endif _ecm_redc3: push ebp push edi push esi push ebx sub esp, 16 mov ecx, [esp+44] mov edi, [esp+36] mov [esp], ecx cmp ecx, 5 jae .unroll .1: mov ebp, [esp+48] mov esi, [esp+40] imul ebp, [edi] mov [esp+36], edi mov ecx, [esp+44] xor ebx, ebx .2: mov eax, [esi] add edi, 4 mul ebp add esi, 4 add eax, ebx adc edx, 0 add [edi-4], eax adc edx, 0 dec ecx mov ebx, edx jnz .2 mov edi, [esp+36] mov [edi], ebx dec dword [esp] lea edi, [edi+4] jnz .1 add esp, 16 pop ebx pop esi pop edi pop ebp ret .unroll: mov edx, ecx dec ecx sub edx, 2 neg ecx shr edx, 4 and ecx, 15 mov [esp+8], edx mov edx, ecx shl edx, 4 neg ecx lea edx, [edx+ecx*1+.loop_base] mov [esp+44], ecx mov [esp+12], edx .4: mov ebp, [esp+48] mov esi, [esp+40] imul ebp, [edi] mov [esp+36], edi mov ecx, [esp+44] mov edx, [esp+8] mov [esp+4], edx mov eax, [esi] lea esi, [esi+ecx*4+4] mul 
ebp lea edi, [edi+ecx*4] mov ebx, edx mov edx, [esp+12] test ecx, 1 mov ecx, eax cmovnz ecx, ebx cmovnz ebx, eax jmp edx align 32 .5: add edi, 64 .loop_base: rloop ebx, ecx, 0 rloop ecx, ebx, 1 rloop ebx, ecx, 2 rloop ecx, ebx, 3 rloop ebx, ecx, 4 rloop ecx, ebx, 5 rloop ebx, ecx, 6 rloop ecx, ebx, 7 rloop ebx, ecx, 8 rloop ecx, ebx, 9 rloop ebx, ecx, 10 rloop ecx, ebx, 11 rloop ebx, ecx, 12 rloop ecx, ebx, 13 rloop ebx, ecx, 14 rloop ecx, ebx, 15 dec dword [esp+4] lea esi, [esi+64] jns .5 add [edi+64], ecx mov edi, [esp+36] adc ebx, 0 mov [edi], ebx dec dword [esp] lea edi, [edi+4] jnz .4 add esp, 16 pop ebx pop esi pop edi pop ebp ret end ecm-6.4.4/build.vc10/assembler/redc.asm0000644023561000001540000000020412106741270014452 00000000000000%ifdef _WIN64 %include "a_x64_redc.asm" %elif AMD_ASM %include "a_win32a_redc.asm" %else %include "a_win32p_redc.asm" %endif ecm-6.4.4/build.vc10/assembler/a_win32p_mulredc.asm0000644023561000001540000000472712106741270016710 00000000000000 ; Part of GMP-ECM ; ; mp_limb_t mulredc1( 1 limb ; mp_limb_t *z, ; const mp_limb_t x, ; const mp_limb_t y, ; const mp_limb_t m, ; mp_limb_t inv_m ; ) ; ; mp_limb_t mulredc( > 1 limb ; mp_limb_t *z, ; const mp_limb_t *x, ; const mp_limb_t *y, ; const mp_limb_t *m, ; mp_limb_t inv_m ; ) %macro mseq 1 movd mm1, [esi+4*%1] movd mm2, [edi+4*%1] pmuludq mm1, mm7 paddq mm2, mm1 paddq mm0, mm2 movd [edi+4*%1], mm0 psrlq mm0, 32 %endmacro %macro mulredc 1 %assign limbs %1 %define f_name(x) _mulredc %+ x global f_name(limbs) %ifdef DLL export f_name(limbs) %endif f_name(limbs): push ebp push edi push esi push ebx sub esp, 8*(limbs+1) mov edi, esp %assign i 0 %rep 2 * limbs + 1 mov dword [edi+4*i], 0 %assign i i + 1 %endrep mov dword [esp+8*limbs+4], limbs align 32 .1: mov eax, [esp+8*limbs+32] mov esi, [esp+8*limbs+36] mov eax, [eax] mul dword [esi] add eax, [edi] mul dword [esp+8*limbs+44] mov ebp, eax mov esi, [esp+8*limbs+40] pxor mm0, mm0 movd mm7, ebp %assign i 0 %rep limbs mseq i %assign i i + 1 
%endrep movd ecx, mm0 add [edi+4*limbs], ecx adc dword [edi+4*limbs+4], 0 mov eax, [esp+8*limbs+32] mov ebp, [eax] mov esi, [esp+8*limbs+36] pxor mm0, mm0 movd mm7, ebp %assign i 0 %rep limbs mseq i %assign i i + 1 %endrep movd ecx, mm0 add [edi+4*limbs], ecx adc dword [edi+4*limbs+4], 0 add dword [esp+8*limbs+32], 4 add edi, 4 dec dword [esp+8*limbs+4] jnz .1 mov ebx, [esp+8*limbs+28] %assign i 0 %rep limbs mov eax, [edi+4*i] mov [ebx+4*i], eax %assign i i + 1 %endrep mov eax, [edi+4*limbs] add esp, 8*(limbs+1) pop ebx pop esi pop edi pop ebp emms ret %endmacro bits 32 section .text global _mulredc1 %ifdef DLL export _mulredc1 %endif _mulredc1: mov eax, [esp+12] mul dword [esp+8] mov [esp+12], edx mov [esp+8], eax mul dword [esp+20] mul dword [esp+16] add eax, [esp+8] adc edx, [esp+12] mov ecx, [esp+4] mov [ecx], edx adc eax, 0 ret %assign i 2 %rep 19 ; 3..20 inclusive mulredc i %assign i i + 1 %endrep end ecm-6.4.4/build.vc10/assembler/Makefile.am0000644023561000001540000000031012106741270015065 00000000000000EXTRA_DIST = a_win32a_mulredc.asm a_win32a_redc.asm a_win32p_mulredc.asm \ a_win32p_redc.asm a_x64_mulredc.asm a_x64_redc.asm \ test_mulredc.c mulredc.h mulredc.asm redc.asm ecm-6.4.4/build.vc10/assembler/a_x64_redc.asm0000644023561000001540000000557612106741270015474 00000000000000; ; Part of GMP-ECM ; ; void ecm_redc3( ; mp_limb_t *z, rdi r8 <- rcx ; const mp_limb_t *x, rsi r9 <- rdx ; size_t n, rdx r10 <- r8 ; mp_limb_t m rcx r11 <- r9 ; ) %macro rloop 3 mov rax,[byte rsi+8*%3] mul rbp add [byte rdi+8*%3], %1 adc %2, rax mov %1, rdx adc %1, 0 %endmacro bits 64 section .text global ecm_redc3 %ifdef DLL export ecm_redc3 %endif PROC_FRAME ecm_redc3 push_reg rbp push_reg rbx push_reg rsi push_reg rdi alloc_stack 5*8 END_PROLOGUE mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 mov r8, rdi mov r9, rsi mov r10, rdx mov r11, rcx mov rcx, r10 mov [rsp], rcx cmp rcx, 3 jae .unroll .1: mov rbp, r11 mov rsi, r9 imul rbp, [rdi] mov r8, rdi mov rcx, r10 xor rbx, rbx 
.2: mov rax, [rsi] add rdi, 8 mul rbp add rsi, 8 add rax, rbx adc rdx, 0 add [rdi-8], rax adc rdx, 0 dec rcx mov rbx, rdx jnz .2 mov rdi, r8 mov [rdi], rbx dec qword [rsp] lea rdi, [rdi+8] jnz .1 add rsp, 5*8 pop rdi pop rsi pop rbx pop rbp ret .unroll: mov rdx, rcx dec rcx sub rdx, 2 neg rcx shr rdx, 4 and rcx, 15 mov [rsp+16], rdx mov rdx, rcx shl rdx, 4 lea r10, [.loop_base wrt rip] add rdx, r10 lea rdx, [rdx+rcx*4] add rdx, rcx neg rcx mov r10, rcx mov [rsp+24], rdx .4: mov rbp, r11 mov rsi, r9 imul rbp, [rdi] mov r8, rdi mov rcx, r10 mov rdx, [rsp+16] mov [rsp+8], rdx mov rax, [rsi] lea rsi, [rsi+rcx*8+8] mul rbp lea rdi, [rdi+rcx*8] mov rbx, rdx mov rdx, [rsp+24] test rcx, 1 mov rcx, rax cmovnz rcx, rbx cmovnz rbx, rax jmp rdx align 64 .5: add rdi, 128 .loop_base: rloop rcx, rbx, 0 rloop rbx, rcx, 1 rloop rcx, rbx, 2 rloop rbx, rcx, 3 rloop rcx, rbx, 4 rloop rbx, rcx, 5 rloop rcx, rbx, 6 rloop rbx, rcx, 7 rloop rcx, rbx, 8 rloop rbx, rcx, 9 rloop rcx, rbx, 10 rloop rbx, rcx, 11 rloop rcx, rbx, 12 rloop rbx, rcx, 13 rloop rcx, rbx, 14 rloop rbx, rcx, 15 dec qword [rsp+8] lea rsi, [rsi+128] jns .5 add [rdi+128], rcx mov rdi, r8 adc rbx, 0 mov [rdi], rbx dec qword [rsp] lea rdi, [rdi+8] jnz .4 add rsp, 5*8 pop rdi pop rsi pop rbx pop rbp ret ENDPROC_FRAME end ecm-6.4.4/build.vc10/assembler/a_win32a_redc.asm0000644023561000001540000000503512106741270016144 00000000000000; ; Part of GMP-ECM ; ; void ecm_redc3( ; mp_limb_t *z, rdi r8 <- rcx ; const mp_limb_t *x, rsi r9 <- rdx ; size_t n, rdx r10 <- r8 ; mp_limb_t m rcx r11 <- r9 ; ) %macro seq 3 mov eax, [byte esi+4*%3] mul ebp add [byte edi+4*%3], %2 adc %1, eax mov %2, edx adc %2, 0 %endmacro text global _ecm_redc3 _ecm_redc3: push ebp push edi push esi push ebx sub esp, 16 mov ecx, [esp+44] mov edi, [esp+36] mov [esp], ecx cmp ecx, 5 jae .3 .1: mov ebp, [esp+48] mov esi, [esp+40] imul ebp, [edi] mov [esp+36], edi mov ecx, [esp+44] xor ebx, ebx .2: mov eax, [esi] add edi, 4 mul ebp add esi, 4 add eax, ebx adc 
edx, 0 add [edi-4], eax adc edx, 0 dec ecx mov ebx, edx jnz .2 mov edi, [esp+36] mov [edi], ebx dec dword [esp] lea edi, [edi+4] jnz .1 add esp, 16 pop ebx pop esi pop edi pop ebp ret .3: mov edx, ecx dec ecx sub edx, 2 neg ecx shr edx, 4 and ecx, 15 mov [esp+8], edx mov edx, ecx shl edx, 4 neg ecx lea edx, [edx+ecx+.6] mov [esp+44], ecx mov [esp+12], edx .4: mov ebp, [esp+48] mov esi, [esp+40] imul ebp, [edi] mov [esp+36], edi mov ecx, [esp+44] mov edx, [esp+8] mov [esp+4], edx mov eax, [esi] lea esi, [esi+ecx*4+4] mul ebp lea edi, [edi+ecx*4] mov ebx, edx mov edx, [esp+12] test ecx, 1 mov ecx, eax cmovnz ecx, ebx cmovnz ebx, eax jmp edx align 32 .5: add edi, 64 .6: %assign i 0 %rep 16 %if (i & 1) seq ecx, ebx, i %else seq ebx, ecx, i %endif %assign i i + 1 %endrep dec dword [esp+4] lea esi, [esi+64] jns .5 add [edi+64], ecx mov edi, [esp+36] adc ebx, 0 mov [edi], ebx dec dword [esp] lea edi, [edi+4] jnz .4 add esp, 16 pop ebx pop esi pop edi pop ebp ret end ecm-6.4.4/build.vc10/mul_fft-params.h.win32.amd0000644023561000001540000000017512106741271015655 00000000000000/* Empty file so that #include won't produce an error message. With no parameters defined, mul_fft.c will use defaults. 
*/ecm-6.4.4/build.vc10/ecm/0000755023561000001540000000000012113421641011701 500000000000000ecm-6.4.4/build.vc10/ecm/ecm.vcxproj.filters0000644023561000001540000000371012106741270015457 00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hpp;hxx;hm;inl;inc;xsd Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Header Files Header Files Header Files Header Files Header Files Header Files ecm-6.4.4/build.vc10/ecm/ecm.vcxproj0000644023561000001540000003056712106741270014022 00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} ecm Win32Proj Application MultiByte Application MultiByte Application MultiByte Application MultiByte <_ProjectFileVersion>10.0.30128.1 $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ false $(SolutionDir)..\bin\$(Platform)\Release\ $(Platform)\Release\ false $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ true $(SolutionDir)..\bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ true Full true Speed ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) MultiThreaded Level3 ProgramDatabase Default true ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib true Console true true false MachineX86 X64 Full true Speed ..\..\..\$(mp_dir)lib\$(IntDir);%(AdditionalIncludeDirectories) WIN32;_WIN64;NDEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) MultiThreaded Level3 ProgramDatabase Default true ws2_32.lib;..\..\..\$(mp_dir)lib\$(Platform)\release\$(mp_lib);%(AdditionalDependencies) true Console true true false MachineX64 8388608 65536 Disabled ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) 
WIN32;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) true EnableFastChecks MultiThreadedDebug Level3 EditAndContinue Default true ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib true Console false MachineX86 X64 Disabled ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;_WIN64;_DEBUG;_CONSOLE;OUTSIDE_LIBECM;%(PreprocessorDefinitions) true EnableFastChecks MultiThreadedDebug Level3 ProgramDatabase Default true ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib true Console false MachineX64 8388608 65536 ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) ..\..\..\$(mp_dir)lib\$(IntDir)$(mp_lib);advapi32.lib;ws2_32.lib {cd555681-d65b-4173-a29c-b8bf06a4871b} false ecm-6.4.4/build.vc10/ecm/Makefile.am0000644023561000001540000000005512106741270013662 00000000000000EXTRA_DIST = ecm.vcxproj ecm.vcxproj.filters ecm-6.4.4/build.vc10/mul_fft-params.h.win32.intel0000644023561000001540000001073112106741271016226 00000000000000#define MUL_FFT_MODF_THRESHOLD 480 #define SQR_FFT_MODF_THRESHOLD 480 #define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {305, 5 /*95*/}, {321, 4 /*97*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {801, 6 /*96*/}, {1281, 7 /*91*/}, {1409, 6 /*97*/}, {1601, 7 /*92*/}, {1921, 6 /*98*/}, {1985, 7 /*94*/}, {2689, 8 /*91*/}, {2817, 7 /*95*/}, {3201, 8 /*92*/}, {3329, 7 /*96*/}, {3457, 8 /*87*/}, {3841, 7 /*96*/}, {3969, 8 /*88*/}, {4865, 7 /*97*/}, {4993, 8 /*90*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*83*/}, {11777, 8 /*97*/}, {12033, 9 /*85*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {15873, 8 /*98*/}, {16129, 9 /*88*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {26113, 10 /*81*/}, {31745, 9 /*98*/}, {34305, 10 /*85*/}, {39937, 9 /*98*/}, {40449, 10 /*83*/}, {48129, 11 /*75*/}, {63489, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {129025, 9 /*98*/}, {130561, 11 /*80*/}, 
{194561, 12 /*75*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 9 /*99*/}, {278017, 10 /*94*/}, {293889, 9 /*99*/}, {294401, 7 /*99*/}, {294529, 8 /*99*/}, {294657, 10 /*94*/}, {310273, 9 /*99*/}, {310785, 10 /*95*/}, {326657, 12 /*83*/}, {389121, 13 /*75*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {662529, 11 /*96*/}, {686081, 10 /*99*/}, {687105, 9 /*99*/}, {687617, 11 /*95*/}, {718849, 10 /*99*/}, {752641, 9 /*99*/}, {753153, 11 /*95*/}, {784385, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {980993, 10 /*99*/}, {982017, 12 /*93*/}, {LONG_MAX, 0}} #define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {273, 5 /*94*/}, {289, 4 /*97*/}, {305, 5 /*95*/}, {609, 6 /*95*/}, {641, 5 /*97*/}, {673, 6 /*95*/}, {705, 5 /*97*/}, {737, 6 /*96*/}, {1473, 7 /*96*/}, {1537, 6 /*98*/}, {1601, 7 /*96*/}, {1665, 6 /*98*/}, {1729, 7 /*96*/}, {2689, 8 /*91*/}, {2817, 7 /*97*/}, {2945, 8 /*92*/}, {3329, 7 /*98*/}, {3457, 8 /*93*/}, {5377, 9 /*91*/}, {5633, 8 /*95*/}, {6401, 9 /*92*/}, {6657, 8 /*96*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {7937, 9 /*88*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {24065, 10 /*85*/}, {27649, 11 /*87*/}, {30721, 10 /*96*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {39937, 11 /*83*/}, {47105, 10 /*97*/}, {48129, 12 /*75*/}, {61441, 11 /*96*/}, {63489, 10 /*98*/}, {68609, 11 /*85*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 12 /*85*/}, {323585, 10 /*99*/}, {326657, 9 /*99*/}, {327169, 10 /*95*/}, {330753, 12 
/*84*/}, {389121, 10 /*99*/}, {392193, 9 /*99*/}, {392705, 10 /*96*/}, {408577, 9 /*99*/}, {409089, 8 /*99*/}, {409345, 10 /*96*/}, {412673, 12 /*90*/}, {454657, 13 /*87*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {555009, 10 /*99*/}, {556033, 9 /*99*/}, {556545, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {654337, 11 /*95*/}, {686081, 13 /*87*/}, {778241, 11 /*99*/}, {817153, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {915457, 12 /*93*/}, {978945, 14 /*93*/}, {LONG_MAX, 0}} #define MUL_FFT_FULL_TABLE2 {{100, 2}, {216, 1}, {256, 2}, {264, 1}, {304, 2}, {312, 1}, {544, 4}, {560, 1}, {704, 2}, {720, 1}, {896, 2}, {960, 7}, {40960, 2}, {47616, 1}, {49152, 6}, {53760, 4}, {56320, 1}, {64512, 4}, {71680, 5}, {86016, 2}, {96768, 4}, {99840, 1}, {131072, 6}, {136192, 7}, {147456, 6}, {150528, 4}, {161280, 1}, {161792, 3}, {172032, 2}, {193536, 1}, {259072, 6}, {286720, 7}, {294912, 6}, {301056, 4}, {322560, 3}, {344064, 2}, {387072, 1}, {393216, 4}, {404480, 3}, {409600, 1}, {417792, 3}, {425984, 1}, {524288, 6}, {530432, 7}, {557056, 6}, {566272, 5}, {577536, 4}, {593920, 6}, {602112, 5}, {614400, 4}, {645120, 3}, {647168, 4}, {652800, 1}, {654336, 6}, {673792, 3}, {688128, 2}, {724992, 4}, {727040, 1}, {753664, 2}, {783360, 4}, {816640, 6}, {831488, 1}, {851968, 2}, {860160, 3}, {868352, 2}, {881664, 7}, {884736, 1}, {921600, 7}, {950272, 1}, {LONG_MAX, 1}} ecm-6.4.4/build.vc10/vsyasm.targets0000644023561000001540000001046312106741270014003 00000000000000 _YASM $(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml @(YASM, '|') $(ComputeLinkInputsTargets); ComputeYASMOutput; $(ComputeLibInputsTargets); ComputeYASMOutput; ecm-6.4.4/build.vc10/ecm-params.h.win32.amd0000644023561000001540000000115012106741271014757 00000000000000/* this is the 
parameter file for Opteron */ #define MPZMOD_THRESHOLD 170 #define REDC_THRESHOLD 294 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 #define SPV_NTT_GFP_DIF_RECURSIVE_THRESHOLD 32768 #define SPV_NTT_GFP_DIT_RECURSIVE_THRESHOLD 32768 #define MUL_NTT_THRESHOLD 1024 #define PREREVERTDIVISION_NTT_THRESHOLD 64 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 512 ecm-6.4.4/build.vc10/libecm/0000755023561000001540000000000012113421641012370 500000000000000ecm-6.4.4/build.vc10/libecm/libecm.vcxproj0000644023561000001540000002754712106741270015204 00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {CD555681-D65B-4173-A29C-B8BF06A4871B} libecm Win32Proj StaticLibrary MultiByte StaticLibrary MultiByte StaticLibrary MultiByte StaticLibrary MultiByte Static <_ProjectFileVersion>10.0.30128.1 $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ $(SolutionDir)..\lib\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ ecmlib $(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ cd $(SolutionDir) call file_copy ecm-params.h.win32.intel ..\ecm-params.h call file_copy mul_fft-params.h.win32.intel ..\mul_fft-params.h call file_copy config.h ..\config.h Full true ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;NDEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) MultiThreaded Level3 Default cd $(SolutionDir) call file_copy ecm-params.h.x64.intel ..\ecm-params.h call file_copy mul_fft-params.h.x64.intel ..\mul_fft-params.h call file_copy config.h ..\config.h X64 Full true ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) 
WIN32;_WIN64;NDEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) MultiThreaded Level3 Default _WIN64 cd $(SolutionDir) call file_copy ecm-params.h.win32.intel ..\ecm-params.h call file_copy mul_fft-params.h.win32.intel ..\mul_fft-params.h call file_copy config.h ..\config.h Disabled ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;_DEBUG;_LIB;SSE2;USE_ASM_REDC;%(PreprocessorDefinitions) true EnableFastChecks MultiThreadedDebug Level3 Default cd $(SolutionDir) call file_copy ecm-params.h.x64.intel ..\ecm-params.h call file_copy mul_fft-params.h.x64.intel ..\mul_fft-params.h call file_copy config.h ..\config.h X64 Disabled ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\;%(AdditionalIncludeDirectories) WIN32;_WIN64;_DEBUG;_LIB;USE_ASM_REDC;%(PreprocessorDefinitions) true EnableFastChecks MultiThreadedDebug Level3 Default _WIN64 MaxSpeed Full Full Full Full ecm-6.4.4/build.vc10/libecm/libecm.vcxproj.filters0000644023561000001540000001175712106741270016647 00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hpp;hxx;hm;inl;inc;xsd {2f18179f-5dba-420c-8dc7-bc7f8228a1b2} Source Files\Assembler Source Files\Assembler Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files ecm-6.4.4/build.vc10/libecm/Makefile.am0000644023561000001540000000006512106741270014352 00000000000000EXTRA_DIST = libecm.vcxproj libecm.vcxproj.filters 
ecm-6.4.4/build.vc10/readme.txt0000644023561000001540000001074412106741271013067 00000000000000 Building GMP-ECM with Microsoft Visual C++ 2010 (version 10) =========================================================== If you wish to build the assembler code support you will need to install the YASM assembler that is available at: http://www.tortall.net/projects/yasm/ The version you need is vsyasm, which should be put in the same directory as your Visual C++ compiler, which is typically: C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin The Multi-Precision Library - GMP and MPIR ========================================== GMP-ECM works with either GMP or MPIR, a fork of GMP. To build and run GMP-ECM using Visual Studio you first need to obtain and build either GMP or MPIR. MPIR has a fully integrated Visual Studio build system for Windows but GMP does not. The VC++ build of GMP-ECM now defaults to MPIR but the property sheet mp_lib.props can be edited to set the macro mp_lib to 'gmp' instead of 'mpir' to build ECM using GMP. GMP === GMP can be built from the GMP source code available here: http://gmplib.org/ using the Visual Studio build files I provide here: http://www.gladman.me.uk/computing/gmp4win.php But these are based on GMP 4.2.x and are no longer being maintained. GMP 4.3.x can be built using cygwin or mingw for win32 and it is reported that the resulting libraries work with Visual Studio when appropriately renamed. It may also be possible to build the generic C version of GMP for 64-bit Windows systems using mingw64. But this version will be fairly slow because it cannot use the fast assembler normally used by GMP because this is not available in Windows format. MPIR ==== MPIR is available here: http://www.mpir.org It has full support for building MPIR for 32 and 64 bit Windows systems with x86 assembler support using the YASM assembler. 
In particular it includes fast assembler code for modern AMD and Intel architectures running in 64-bit mode on Windows (not available in GMP). Building GMP-ECM ================ The build files for GMP-ECM assume that the GMP and ECM build directories are in a common parent directory as follows: Parent Directory MPIR (or GMP) build.vc10 -- MPIR (or GMP) build files ... GMP-ECM build.vc10 -- ECM build files The root directories for GMP and GMP-ECM are assumed to have these names irrespective of which version is being used (they used to be followed by version numbers but this meant that the build projects had to be updated too frequently). There are three build projects in build.vc10: ecm - the ECM application ecmlib - the ECM library tune - a program for tuning Before starting a build, these two files: ecm-params.h mul_fft-params.h must be edited to set the tuning parameters that should be used in the build. Select the tuning include files by changing the appropriate '#elif 0' to '#elif 1'. If you wish to use the win32 AMD assembler files, you also have to use the Visual Studio property page to define AMD_ASM (alternatively you can edit mulredc.asm and redc.asm in the build.vc10\assembler\ directory to include the AMD assembler). 
When versions of ecm and ecmlib are built, the library and the application are put in the directory matching the configuration that has been built: GMP-ECM build.vc10 -- ECM build files lib -- ECM static library files dll -- ECM dynamic library files bin -- ECM executable files Within these lib, dll and bin directories, the outputs are located in sub-directories determined by the platform and configuration: win32\release win32\debug x64\release x64\debug If you don't want assembler support you need to change the define: #define NATIVE_REDC 1 in config.h (in the build.vc10 subdirectory) to: #undef NATIVE_REDC Tune ==== If tune is compiled and run for a particular configuration it will output a file with appropriate parameters for this configuration with a name such as: ecm-params.h.win32.amd.new To use this file when building ecm and ecmlib, remove the '.new' extension and add a reference to it in the ecm-params.h file in the build.vc10 directory. Tests ===== The file tests.py is a Python script that runs the ECM tests. It runs the x64\Release version by default but can be edited to test other builds. 
Brian Gladman, 3rd January 2012 ecm-6.4.4/build.vc10/getrusage.h0000644023561000001540000000271412106741271013226 00000000000000 #ifndef _GETRUSAGE_H #define _GETRUSAGE_H #if defined(__cplusplus) extern "C" { #endif #define ENODATA 61 #define RUSAGE_SELF 0 #define RUSAGE_CHILDREN -1 typedef struct { long tv_sec; long tv_usec; } tval; typedef struct rusage { tval ru_utime; /* user time used */ tval ru_stime; /* system time used */ long ru_maxrss; /* integral max resident set size */ long ru_ixrss; /* integral shared text memory size */ long ru_idrss; /* integral unshared data size */ long ru_isrss; /* integral unshared stack size */ long ru_minflt; /* page reclaims */ long ru_majflt; /* page faults */ long ru_nswap; /* swaps */ long ru_inblock; /* block input operations */ long ru_oublock; /* block output operations */ long ru_msgsnd; /* messages sent */ long ru_msgrcv; /* messages received */ long ru_nsignals;/* signals received */ long ru_nvcsw; /* voluntary context switches */ long ru_nivcsw; /* involuntary context switches */ } rusage; int getrusage(int who, rusage *usage); #if defined(__cplusplus) } #endif #endif ecm-6.4.4/build.vc10/vsyasm.props0000644023561000001540000000231712106741270013474 00000000000000 Midl CustomBuild _SelectedFiles;$(YASMDependsOn) C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\ False $(IntDir) 0 0 "$(YasmPath)"vsyasm.exe -Xvc -f $(Platform) [AllOptions] [AdditionalOptions] [Inputs] %(ObjectFile) Assembling %(Filename)%(Extension) false ecm-6.4.4/build.vc10/bench_mulredc/0000755023561000001540000000000012113421642013730 500000000000000ecm-6.4.4/build.vc10/bench_mulredc/bench_mulredc.vcxproj0000644023561000001540000002150712106741270020070 00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {4727DE12-787D-432D-B166-BF103B0C3C87} Win32Proj bench_mulredc Application true Unicode Application true Unicode Application false true Unicode Application false true Unicode true 
$(SolutionDir)..bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ true $(SolutionDir)..bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ false $(SolutionDir)..bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ false $(SolutionDir)..bin\$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ Level3 Disabled WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ MultiThreadedDebug Console true psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) Level3 Disabled _WIN64;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ MultiThreadedDebug Console true psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) Level3 MaxSpeed true true WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ MultiThreaded Console true true true psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) Level3 MaxSpeed true true _WIN64;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) ..\..\..\$(mp_dir)lib\$(IntDir);..\..\;..\assembler;..\ MultiThreaded Console true true true psapi.lib;..\..\..\$(mp_dir)lib\$(IntDir)\mpir.lib;..\..\lib\$(IntDir)\libecm.lib;%(AdditionalDependencies) ecm-6.4.4/build.vc10/bench_mulredc/bench_mulredc.vcxproj.filters0000644023561000001540000000130612106741270021532 00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hpp;hxx;hm;inl;inc;xsd Source Files ecm-6.4.4/build.vc10/bench_mulredc/Makefile.am0000644023561000001540000000010112106741270015700 00000000000000EXTRA_DIST = bench_mulredc.vcxproj bench_mulredc.vcxproj.filters ecm-6.4.4/build.vc10/mul_fft-params.h.x64.intel0000644023561000001540000000017512106741271015706 
00000000000000/* Empty file so that #include won't produce an error message. With no parameters defined, mul_fft.c will use defaults. */ecm-6.4.4/build.vc10/tests.py0000644023561000001540000003035712106741270012606 00000000000000 from __future__ import print_function import os import sys import string import platform from re import match from subprocess import Popen, PIPE, STDOUT from tempfile import * from time import clock class Timer() : def __enter__(self): self.start = clock() def __exit__(self, *args): print(' time {:.3f} milliseconds'.format(1000 * (clock() - self.start))) test_dir = '..\\bin\\x64\\Release\\' # test_dir = '..\\bin\\win32\\Release\\' ecm = [ ("2050449353925555290706354283", "-sigma 7 -k 1 30 0-1e6", 14), ("137703491", "-sigma 6 84 1000", 8), ("3533000986701102061387017352606588294716061", "-sigma 1621 191 225", 14), ("145152979917007299777325725119", "-sigma 711387948 924 117751", 14), ("2^919-1", "-sigma 262763035 937 1", 6), ("2^919-1", "-sigma 1691973485 283 1709", 6), ("(2^1033+1)/3", "-sigma 2301432245 521 1", 6), ("(2^1033+1)/3", "-sigma 2301432245 223 1847", 6), ("(2^1063+1)/3/26210488518118323164267329859", "-sigma 2399424618 383 1", 6), ("(2^1063+1)/3/26210488518118323164267329859", "-sigma 2399424618 71 500", 6), ("242668358425701966181147598421249782519178289604307455138484425562807899", "-sigma 1417477358 28560 8e7-85507063", 14), ("3533000986701102061387017352606588294716061", "-sigma 291310394389387 191 225", 14), ("121279606270805899614487548491773862357", "-sigma 1931630101 120", 14), ("291310394389387", "-power 3 -sigma 40 2000", 8), ("3533000986701102061387017352606588294716061", "-sigma 3547 167 211", 14), ("449590253344339769860648131841615148645295989319968106906219761704350259884936939123964073775456979170209297434164627098624602597663490109944575251386017", "-sigma 63844855 -go 172969 61843 20658299", 14), ("17061648125571273329563156588435816942778260706938821014533", "-sigma 585928442 174000", 14), 
("89101594496537524661600025466303491594098940711325290746374420963129505171895306244425914080753573576861992127359576789001", "-sigma 877655087 -go 325001 157721 1032299", 14), ("5394204444759808120647321820789847518754252780933425517607611172590240019087317088600360602042567541009369753816111824690753627535877960715703346991252857", "-sigma 805816989 -go 345551 149827", 6), ("3923385745693995079670229419275984584311007321932374190635656246740175165573932140787529348954892963218868359081838772941945556717", "-sigma 876329474 141667 150814537", 14), ("124539923134619429718018353168641490719788526741873602224103589351798060075728544650990190016536810151633233676972068237330360238752628542584228856301923448951", "-sigma 1604840403 -go 983591971839332299 96097 24289207", 14), ("5735013127104523546495917836490637235369", "-power 60 -k 2 -A 3848610099745584498259560038340842096471 -x0 2527419713481530878734189429997880136878 330000 500000000", 8), ("17833653493084084667826559287841287911473", "-power 6 -k 2 -A 7423036368129288563912180723909655170075 -x0 9011819881065862648414808987718432766274 389797 16e8", 8), ("212252637915375215854013140804296246361", "-power 15 -k 2 -sigma 781683988 1000000", 8), ("4983070578699621345648758795946786489699447158923341167929707152021191319057138908604417894224244096909460401007237133698775496719078793168004317119431646035122982915288481052088094940158965731422616671", "-sigma 909010734 122861 176711", 6), ("1408323592065265621229603282020508687", "-sigma 1549542516 -go 2169539 531571 29973883000-29973884000", 8), ("3213162276640339413566047915418064969550383692549981333701", "-sigma 2735675386 -go 1615843 408997 33631583", 8), ("39614081257132168796771975177", "-sigma 480 1e6", 8), ("10000286586958753753", "-sigma 3956738175 1e6", 8), ("49672383630046506169472128421", "-sigma 2687434659 166669 86778487", 8), ("216259730493575791390589173296092767511", "-sigma 214659179 1124423 20477641", 8), 
("49367108402201032092269771894422156977426293789852367266303146912244441959559870316184237", "-sigma 6 5000", 0), ("(2^1063+1)/3/26210488518118323164267329859", "-sigma 2399424618 383 1", 6), ("10090030271*10^400+696212088699", "-sigma 3923937547 1e3 1e6", 14), ("458903930815802071188998938170281707063809443792768383215233", "-batch -A 103699173453039012668349162616750601868936199904547322268878 10000", 14), ("458903930815802071188998938170281707063809443792768383215233", "-batch=2 -A 103699173453039012668349162616750601868936199904547322268878 10000", 14), ("2^349-1", "-batch -A 13883915733485915535567641090102088744917579395318243004655770450844428217574163575149253565087742 587 29383", 6), ("2^349-1", "-batch=2 -A 13883915733485915535567641090102088744917579395318243004655770450844428217574163575149253565087742 587 29383", 6), ("2^347-1", "-batch -A 292897222300654795048417351458499833714895857628156011078988080472621879897670335421898676171177982 3301 229939", 14), ("2^347-1", "-batch=2 -A 292897222300654795048417351458499833714895857628156011078988080472621879897670335421898676171177982 3301 229939", 14), ("911962091", "-batch=2 -A 440688534 50000", 8), ("31622776601683791911", "-batch=1 -A 27063318473587686303 11000", 0), ("18446744073709551557", "-batch -A 312656731337392125 11000", 8), ("4294967291", "-batch -A 17 1000", 8), ("((173^173+1)/174)/471462511391940575680645418941", "-sigma 12345 20000", 0), ("((173^173+1)/174)/471462511391940575680645418941+122", "-sigma 77 20000", 6), ("10000000000000000000000000000000000000121", "-sigma 61 -go 1195504287780095287 2950307", 8), ("10000000000000000000000000000000000000121", "-sigma 266 -go 218187387944803649 9405629", 8), ("10000000000000000000000000000000000000121", "-sigma 291 -go 5994496018878137 4372759", 8) ] pm1 = [ ("441995541378330835457", "-pm1 -x0 3 157080 7e9-72e8", 8 ), ("335203548019575991076297", "-pm1 -x0 2 23 31", 8 ), ("335203548019575991076297", "-pm1 -x0 3 31 
58766400424189339249-58766400424189339249", 8 ), ("2050449353925555290706354283", "-pm1 -k 1 20 0-1e6", 14 ), ("67872792749091946529", "-pm1 -x0 3 8467 11004397", 8 ), ("5735039483399104015346944564789", "-pm1 1277209 9247741", 8 ), ("620224739362954187513", "-pm1 -x0 3 668093 65087177", 8 ), ("1405929742229533753", "-pm1 1123483 75240667", 8 ), ("16811052664235873", "-pm1 -x0 3 19110 178253039", 8 ), ("9110965748024759967611", "-pm1 1193119 316014211", 8 ), ("563796628294674772855559264041716715663", "-pm1 4031563 14334623", 8 ), ("188879386195169498836498369376071664143", "-pm1 3026227 99836987", 8 ), ("474476178924594486566271953891", "-pm1 9594209 519569569", 8 ), ("2124306045220073929294177", "-pm1 290021 1193749003", 8 ), ("504403158265489337", "-pm1 -k 4 8 9007199254740700-9007199254740900", 8 ), ("6857", "-pm1 840 857", 8 ), ("10090030271*10^400+696212088699", "-pm1 2e3 2e6", 14), ("2^(64*2)-1", "-pm1 -redc -x0 -1 2 1", 8), ("234^997+997^234", "-pm1 -ntt 100 324", 0) # Try saving and resuming # ("25591172394760497166702530699464321", "-pm1 -save test.pm1.save 100000 # checkcode $? 0 # $PM1 -resume test.pm1.save 120557 2007301 # C=$? 
# /bin/rm -f test.pm1.save # checkcode $C 8 ), ] pp1 = [ ("574535754974673735383001137423881", "-pp1 -x0 5 11046559 34059214979", 8 ), ("1212493270942550395500491620526329", "-pp1 -x0 9 1322743 15132776749", 8 ), ("12949162694219360835802307", "-pp1 -x0 5 3090877 362336209", 8 ), ("2224933405617843870480157177909", "-pp1 -x0 6 568751 573379", 8 ), ("6588443517876550825940165572081", "-pp1 -x0 5 308141 4213589", 8 ), ("951513164333845779921357796547797", "-pp1 -x0 5 991961 1927816573", 8 ), ("30273798812158206865862514296968537", "-pp1 -x0 5 24039443 5071284641", 8 ), ("4745647757936790297247194404494391", "-pp1 -x0 9 34652707 4267610467", 8 ), ("1267992248510159742851354500921987", "-pp1 -x0 5 205435127 3011959669", 8 ), ("3376019969685846629149599470807382641", "-pp1 -x0 5 16221563 125604601", 8 ), ("14783171388883747638481280920502006539", "-pp1 -x0 5 5963933 549138481", 8 ), ("884764954216571039925598516362554326397028807829", "-pp1 -x0 6 80105797 2080952771", 8 ), ("5703989257175782343045829011448227", "-pp1 -x0 6 2737661 581697661", 8 ), ("36542278409946587188439197532609203387", "-pp1 -x0 5 75484441 721860287", 8 ), ("23737785720181567451870298309457943", "-pp1 -x0 7 138563 9639649", 8 ), ("9535226150337134522266549694936148673", "-pp1 -x0 7 3037709 84506953", 8 ), ("68095768294557635629913837615365499", "-pp1 -x0 5 36936017 167452427", 8 ), ("3180944478436233980230464769757467081", "-pp1 -x0 5 7373719 764097571", 8 ), ("2879563791172315088654652145680902993", "-pp1 -x0 7 29850409 34290301", 8 ), ("79382035150980920346405340690307261392830949801", "-pp1 -x0 5 12073627 32945877451", 8 ), ("514102379852404115560097604967948090456409", "-pp1 -x0 8 223061 61500567937", 8 ), ("173357946863134423299822098041421951472072119", "-pp1 -x0 5 992599901 1401995848117", 8 ), ("183707757246801094558768264908628886377124291177", "-pp1 -x0 5 382807709 1052258680511", 8 ), ("16795982678646459679787538694991838379", "-pp1 -x0 6 2957579 26509499", 8 ), # 
("7986478866035822988220162978874631335274957495008401", "-pp1 -x0 17 1632221953 843497917739, 8), # ("725516237739635905037132916171116034279215026146021770250523", "-pp1 -x0 5 51245344783 483576618980159", 8 ), ("1809864641442542950172698003347770061601055783363", "-pp1 -x0 6 21480101 12037458077389", 8 ), ("435326731374486648601801668751442584963", "-pp1 -x0 6 12002513 27231121", 8 ), ("3960666914072777038869829205072430197479", "-pp1 -x0 5 16534249 21802223243", 8) ] pp1_2 = [ ("328006342451", "-pp1 -x0 5 120 7043", 8 ), ("328006342451", "-pp1 -x0 1/5 120 7043", 8 ), ("2050449218179969792522461197", "-pp1 -x0 6 -k 1 20 0-1e6", 14), ("6215074747201", "-pp1 -power 2 -x0 5 630 199729", 8 ), ("6215074747201", "-pp1 -dickson 3 -x0 5 630 199729", 8 ), ("8857714771093", "-pp1 -x0 3 23251 49207", 8 ), ("236344687097", "-pp1 -x0 3 619 55001", 8 ), ("87251820842149", "-pp1 -x0 5 3691 170249", 8 ), ("719571227339189", "-pp1 -x0 4 41039 57679", 8 ), ("5468575720021", "-pp1 -x0 6 1439 175759", 8 ), ("49804972211", "-pp1 -x0 5 15443 268757", 8 ), ("329573417220613", "-pp1 -x0 3 5279 101573", 8 ), ("4866979762781", "-pp1 -x0 4 7309 97609", 8 ), ("187333846633", "-pp1 -x0 3 2063 9851", 8 ), ("332526664667473", "-pp1 -x0 3 65993 111919", 8 ), ("265043186297", "-pp1 -x0 3 8761 152791", 8 ), ("207734163253", "-pp1 -x0 3 1877 4211", 8 ), ("225974065503889", "-pp1 -x0 5 -k 5 7867 8243", 8 ), ("660198074631409", "-pp1 -x0 5 22541 115679", 8 ), ("563215815517", "-pp1 -x0 3 3469 109849", 8 ), ("563215815517", "-pp1 -x0 3 3469 109849-109849", 8 ), ("409100738617", "-pp1 -x0 3 19 19", 8 ), ("2277189375098448170118558775447117254551111605543304035536750762506158547102293199086726265869065639109", "-pp1 -x0 3 2337233 132554351", 14), ("630503947831861669", "-pp1 -x0 5 7 9007199254740000-9007199254741000", 8 ), ("8589934621", "-pp1 -x0 10 4294967310-4294967311 1", (1, 8) ), ("6054018161*10^400+417727253109", "-pp1 -x0 4 2e3 2e6", 14), ("154618728587", "-pp1 -x0 3 -go 36 
4294957296-4294967295 1", 8) ] c200 = [ ("29799904256775982671863388319999573561548825027149399972531599612392671227006866151136667908641695103422986028076864929902803267437351318167549013218980573566942647077444419419003164546362008247462049", "-pm1 2 1e10", 0) ] test = [ ("173357946863134423299822098041421951472072119", "-pp1 -x0 5 992599901 1401995848117", 8 ), ] def run_exe(exe, args, inp) : al = {'stdin' : PIPE, 'stdout' : PIPE, 'stderr' : STDOUT } if sys.platform.startswith('win') : al['creationflags'] = 0x08000000 p = Popen([exe] + args.split(' '), **al) res = p.communicate(inp.encode())[0].decode() ret = p.poll() return (ret, res) def do_tests(tests) : global out exe = test_dir + "ecm.exe" for tt in tests : rv = run_exe(exe, tt[1], tt[0]) if type(tt[2]) == int and rv[0] != tt[2] : print("*** ERROR ***", rv[0], tt[2]) elif type(tt[2]) == tuple and \ rv[0] != tt[2][0] and rv[0] != tt[2][1] : print("*** ERROR ***", rv[0], tt[2]) if out : op = rv[1].rsplit('\r\n') for i in op : print(i) with Timer(): out = True do_tests(ecm) do_tests(pm1) do_tests(pp1) do_tests(pp1_2) do_tests(c200) do_tests(test) ecm-6.4.4/build.vc10/ecm-params.h.x64.intel0000644023561000001540000000122112106741271015007 00000000000000/* created 06 Feb 2012 on confit.loria.fr (Intel(R) Core(TM) i5-2500 CPU) for svn revision 1705 with GMP 5.0.3 and gcc 4.6.1 */ #define TUNE_MULREDC_THRESH 21 #define TUNE_SQRREDC_THRESH 9 #define MPZMOD_THRESHOLD 77 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 11, 12, 12, 13, 14, 13, 14, 14, 16, 16, 16, 16, 16, 16} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 8 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 64 #define MPZSPV_NORMALISE_STRIDE 256 ecm-6.4.4/build.vc10/vsyasm.xml0000644023561000001540000002064312106741270013133 00000000000000 General Symbols Files 
Command Line Execute Before Specifies the targets for the build customization to run before. Execute After Specifies the targets for the build customization to run after. Additional Options Additional Options ecm-6.4.4/build.vc10/Makefile.am0000644023561000001540000000075012106741271013121 00000000000000EXTRA_DIST = config.h ecm-params.h ecm-params.h.win32.amd \ ecm-params.h.win32.intel ecm-params.h.x64.amd \ ecm-params.h.x64.intel ecm.sln file_copy.bat \ mp_lib.props mul_fft-params.h.win32.amd \ mul_fft-params.h.win32.intel mul_fft-params.h.x64.amd \ mul_fft-params.h.x64.intel readme.txt tests.py \ vsyasm.props vsyasm.targets vsyasm.xml getrusage.h DIST_SUBDIRS = assembler ecm libecm tune bench_mulredc ecm-6.4.4/build.vc10/file_copy.bat0000644023561000001540000000027112106741271013524 00000000000000if not exist %1 ( echo file_copy failure: %1 not found && goto exit ) if exist %2 ( fc %1 %2 > nul && if not %errorlevel 1 goto exit ) echo copying %1 to %2 && copy %1 %2 :exit ecm-6.4.4/build.vc10/ecm-params.h.win32.intel0000644023561000001540000000127112106741270015334 00000000000000/* those parameters were obtained on toto.loria.fr with ecm-6.3-rc3 gmp-5.0.1, and gcc 4.0.2 -m32 -O2 -pedantic -fomit-frame-pointer -mtune=pentium3 -march=pentium3 */ #define TUNE_MULREDC_THRESH 1 #define TUNE_SQRREDC_THRESH 1 #define MPZMOD_THRESHOLD 98 #define REDC_THRESHOLD 398 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 12, 12, 1, 14, 12, 13, 1, 15, 16, 15, 16, 19, 20, 22} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 1024 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 256 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 1024 ecm-6.4.4/build.vc10/config.h0000644023561000001540000001430512106753344012510 00000000000000/* config.h.in. Generated from configure.in by autoheader. 
*/ #define VERSION "6.4.4" #define VERSION_GPU "gpu_ecm-win" #define PACKAGE_BUGREPORT "ecm-discuss@lists.gforge.inria.fr" /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP systems. This function is required for `alloca.c' support on those systems. */ #undef CRAY_STACKSEG_END /* Define to 1 if using `alloca.c'. */ #define C_ALLOCA 1 /* Define to 1 if you have the `access' function. */ #undef HAVE_ACCESS /* Define to 1 if you have `alloca', as a function or macro. */ #define HAVE_ALLOCA 1 /* Define to 1 if you have and it should be used (not on Ultrix). */ #undef HAVE_ALLOCA_H /* Define to 1 if you have the `ctime' function. */ #define HAVE_CTIME 1 /* Define to 1 if you have the header file. */ #define HAVE_CTYPE_H 1 /* Define to 1 if you have the `floor' function. */ #define HAVE_FLOOR 1 /* Define to 1 if you have the `fmod' function. */ #define HAVE_FMOD 1 /* Define to 1 if you have the `gethostname' function. */ #define HAVE_GETHOSTNAME 1 /* Define to 1 if you have the `getrusage' function. */ #define HAVE_GETRUSAGE 1 /* Define to 1 if you have the `gettimeofday' function. */ #undef HAVE_GETTIMEOFDAY /* Define to 1 if you have the header file. */ #define HAVE_GMP_H 1 /* Define to 1 if gwnum.a or gwnum.lib exist */ #undef HAVE_GWNUM /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H /* Define to 1 if you have the header file. */ #undef HAVE_IO_H /* Define to 1 if you have the `isascii' function. */ #undef HAVE_ISASCII /* Define to 1 if you have the `isdigit' function. */ #define HAVE_ISDIGIT 1 /* Define to 1 if you have the `isspace' function. */ #define HAVE_ISSPACE 1 /* Define to 1 if you have the `isxdigit' function. */ #define HAVE_ISXDIGIT 1 /* Define to 1 if you have the `m' library (-lm). */ #undef HAVE_LIBM /* Define to 1 if you have the header file. */ #define HAVE_LIMITS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_MALLOC_H 1 /* Define to 1 if you have the `malloc_usable_size' function. 
*/ #undef HAVE_MALLOC_USABLE_SIZE /* Define to 1 if you have the header file. */ #define HAVE_MATH_H 1 /* Define to 1 if you have the `memmove' function. */ #define HAVE_MEMMOVE 1 /* Define to 1 if you have the header file. */ #define HAVE_MEMORY_H 1 /* Define to 1 if you have the `memset' function. */ #define HAVE_MEMSET 1 /* Define to 1 if you have the `nice' function. */ #undef HAVE_NICE /* Define to 1 if you have the `pow' function. */ #define HAVE_POW 1 /* Define to 1 if you have the `signal' function. */ #define HAVE_SIGNAL 1 /* Define to 1 if you have the header file. */ #define HAVE_SIGNAL_H 1 /* Define to 1 if you have the `sqrt' function. */ #define HAVE_SQRT 1 /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the `strchr' function. */ #define HAVE_STRCHR 1 /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlen' function. */ #define HAVE_STRLEN 1 /* Define to 1 if you have the `strncasecmp' function. */ #undef HAVE_STRNCASECMP /* Define to 1 if you have the `strstr' function. */ #undef HAVE_STRSTR /* Define to 1 if you have the header file. */ #undef HAVE_SYS_RESOURCE_H /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TIME_H /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* Define to 1 if you have the `time' function. */ #undef HAVE_TIME /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H /* Define to 1 if you have the `unlink' function. */ #define HAVE_UNLINK 1 /* Define to 1 if you have the header file. */ #define HAVE_WINDOWS_H 1 /* Define to 1 if you have the `__gmpn_add_nc' function. 
*/ #if defined( _WIN64 ) # define HAVE___GMPN_ADD_NC 1 #endif /* Define to 1 if you have the `__gmpn_mod_34lsub1' function. */ #define HAVE___GMPN_MOD_34LSUB1 1 /* Define to 1 if you have the `__gmpn_mul_fft' function. */ #define HAVE___GMPN_MUL_FFT 1 /* Define to 1 if you want memory debugging */ #undef MEMORY_DEBUG /* Define if the system has the type `long long'. */ #define HAVE_LONG_LONG 1 #define HAVE_LONG_LONG_INT 1 /* Define to 1 to use asm redc on x86 or x86_64 */ # define NATIVE_REDC 1 /* Define to 1 if your C compiler doesn't accept -c and -o together. */ #undef NO_MINUS_C_MINUS_O /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be automatically deduced at runtime. STACK_DIRECTION > 0 => grows toward higher addresses STACK_DIRECTION < 0 => grows toward lower addresses STACK_DIRECTION = 0 => direction of growth unknown */ #undef STACK_DIRECTION /* Define to 1 if you have the ANSI C header files. */ #define STDC_HEADERS 1 /* Define to 1 if you can safely include both and . */ #undef TIME_WITH_SYS_TIME /* Define to 1 if you want assertions enabled */ #undef WANT_ASSERT /* Define to 1 if you want shell command execution */ #undef WANT_SHELLCMD /* Define to empty if `const' does not conform to ANSI C. */ #undef const /* How to specify hot-spot attribute, if available */ #define ATTRIBUTE_HOT #define HAVE___GMPN_REDC_1 1 #define HAVE___GMPN_REDC_2 1 #define HAVE_ASM_REDC3 1 #define WINDOWS64_ABI 1 /* Define to `__inline__' or `__inline' if that's what the C compiler calls it, or to nothing if 'inline' is not supported under any name. */ #ifndef __cplusplus #define inline __inline #endif /* Define to `unsigned int' if does not define. 
*/ #undef size_t #ifdef _MSC_VER # if _MSC_VER < 1600 # define int64_t __int64 # define uint64_t unsigned __int64 # endif # define strncasecmp strnicmp # define alloca _alloca # define fseek64 _fseek64 # define ftell64 _ftell64 #endif ecm-6.4.4/build.vc10/ecm.sln0000644023561000001540000000713512106741271012353 00000000000000Microsoft Visual Studio Solution File, Format Version 11.00 # Visual Studio 2010 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libecm", "libecm\libecm.vcxproj", "{CD555681-D65B-4173-A29C-B8BF06A4871B}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecm", "ecm\ecm.vcxproj", "{C0E2EA85-996A-4B5F-AD30-590FAF5B7187}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tune", "tune\tune.vcxproj", "{80E08750-5C6C-492E-BB1E-7200978AE125}" ProjectSection(ProjectDependencies) = postProject {CD555681-D65B-4173-A29C-B8BF06A4871B} = {CD555681-D65B-4173-A29C-B8BF06A4871B} {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} = {C0E2EA85-996A-4B5F-AD30-590FAF5B7187} EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bench_mulredc", "bench_mulredc\bench_mulredc.vcxproj", "{4727DE12-787D-432D-B166-BF103B0C3C87}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 Debug|x64 = Debug|x64 Release|Win32 = Release|Win32 Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.ActiveCfg = Debug|Win32 {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|Win32.Build.0 = Debug|Win32 {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.ActiveCfg = Debug|x64 {CD555681-D65B-4173-A29C-B8BF06A4871B}.Debug|x64.Build.0 = Debug|x64 {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.ActiveCfg = Release|Win32 {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|Win32.Build.0 = Release|Win32 {CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.ActiveCfg = Release|x64 
{CD555681-D65B-4173-A29C-B8BF06A4871B}.Release|x64.Build.0 = Release|x64 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.ActiveCfg = Debug|Win32 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|Win32.Build.0 = Debug|Win32 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.ActiveCfg = Debug|x64 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Debug|x64.Build.0 = Debug|x64 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.ActiveCfg = Release|Win32 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|Win32.Build.0 = Release|Win32 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.ActiveCfg = Release|x64 {C0E2EA85-996A-4B5F-AD30-590FAF5B7187}.Release|x64.Build.0 = Release|x64 {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|Win32.ActiveCfg = Release|x64 {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.ActiveCfg = Release|x64 {80E08750-5C6C-492E-BB1E-7200978AE125}.Debug|x64.Build.0 = Release|x64 {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.ActiveCfg = Release|Win32 {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|Win32.Build.0 = Release|Win32 {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.ActiveCfg = Release|x64 {80E08750-5C6C-492E-BB1E-7200978AE125}.Release|x64.Build.0 = Release|x64 {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.ActiveCfg = Debug|Win32 {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|Win32.Build.0 = Debug|Win32 {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.ActiveCfg = Debug|x64 {4727DE12-787D-432D-B166-BF103B0C3C87}.Debug|x64.Build.0 = Debug|x64 {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.ActiveCfg = Release|Win32 {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|Win32.Build.0 = Release|Win32 {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.ActiveCfg = Release|x64 {4727DE12-787D-432D-B166-BF103B0C3C87}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection EndGlobal ecm-6.4.4/build.vc10/mp_lib.props0000644023561000001540000000127012106741271013412 00000000000000 
mpir\ mpir.lib <_ProjectFileVersion>10.0.30128.1 $(mp_dir) true $(mp_lib) true ecm-6.4.4/COPYING.LIB0000644023561000001540000001672712106741274010674 00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. 
You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. 
b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. 
You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. 
ecm-6.4.4/test_mulredc.c0000644023561000001540000001750112106741273012060 00000000000000#include "config.h" #include #include #include #include #include "mulredc.h" void mp_print(const mp_limb_t *x, const int N) { int i; for (i = 0; i < N; ++i) { if (i>0) printf (" + "); printf("%lu", x[i]); if (i>0) printf ("*2^%d", i*GMP_NUMB_BITS); } printf("\n"); } static mp_limb_t call_mulredc (const int N, mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y, const mp_limb_t *m, const mp_limb_t invm) { mp_limb_t cy; switch (N) { case 1: cy = mulredc1(z, x[0], y[0], m[0], invm); break; case 2: cy = mulredc2(z, x, y, m, invm); break; case 3: cy = mulredc3(z, x, y, m, invm); break; case 4: cy = mulredc4(z, x, y, m, invm); break; case 5: cy = mulredc5(z, x, y, m, invm); break; case 6: cy = mulredc6(z, x, y, m, invm); break; case 7: cy = mulredc7(z, x, y, m, invm); break; case 8: cy = mulredc8(z, x, y, m, invm); break; case 9: cy = mulredc9(z, x, y, m, invm); break; case 10: cy = mulredc10(z, x, y, m, invm); break; case 11: cy = mulredc11(z, x, y, m, invm); break; case 12: cy = mulredc12(z, x, y, m, invm); break; case 13: cy = mulredc13(z, x, y, m, invm); break; case 14: cy = mulredc14(z, x, y, m, invm); break; case 15: cy = mulredc15(z, x, y, m, invm); break; case 16: cy = mulredc16(z, x, y, m, invm); break; case 17: cy = mulredc17(z, x, y, m, invm); break; case 18: cy = mulredc18(z, x, y, m, invm); break; case 19: cy = mulredc19(z, x, y, m, invm); break; case 20: cy = mulredc20(z, x, y, m, invm); break; default: cy = mulredc20(z, x, y, m, invm); } return cy; } #if defined(HAVE_NATIVE_MULREDC1_N) static mp_limb_t call_mulredc1 (const int N, mp_limb_t *z, const mp_limb_t x, const mp_limb_t *y, const mp_limb_t *m, const mp_limb_t invm) { mp_limb_t cy; switch (N) { case 1: cy = mulredc1(z, x, y[0], m[0], invm); break; case 2: cy = mulredc1_2(z, x, y, m, invm); break; case 3: cy = mulredc1_3(z, x, y, m, invm); break; case 4: cy = mulredc1_4(z, x, y, m, invm); break; case 5: cy = 
mulredc1_5(z, x, y, m, invm); break; case 6: cy = mulredc1_6(z, x, y, m, invm); break; case 7: cy = mulredc1_7(z, x, y, m, invm); break; case 8: cy = mulredc1_8(z, x, y, m, invm); break; case 9: cy = mulredc1_9(z, x, y, m, invm); break; case 10: cy = mulredc1_10(z, x, y, m, invm); break; case 11: cy = mulredc1_11(z, x, y, m, invm); break; case 12: cy = mulredc1_12(z, x, y, m, invm); break; case 13: cy = mulredc1_13(z, x, y, m, invm); break; case 14: cy = mulredc1_14(z, x, y, m, invm); break; case 15: cy = mulredc1_15(z, x, y, m, invm); break; case 16: cy = mulredc1_16(z, x, y, m, invm); break; case 17: cy = mulredc1_17(z, x, y, m, invm); break; case 18: cy = mulredc1_18(z, x, y, m, invm); break; case 19: cy = mulredc1_19(z, x, y, m, invm); break; case 20: cy = mulredc1_20(z, x, y, m, invm); break; default: cy = mulredc1_20(z, x, y, m, invm); } return cy; } #endif void test(mp_size_t N, int k) { mp_limb_t *x, *y, *yp, *z, *m, invm, cy, cy2, *tmp, *tmp2, *tmp3; int i, j; x = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); y = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); z = (mp_limb_t *) malloc((N+1)*sizeof(mp_limb_t)); m = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); tmp = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); tmp2 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); tmp3 = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); if (x == NULL || y == NULL || z == NULL || m == NULL || tmp == NULL || tmp2 == NULL || tmp3 == NULL) { fprintf (stderr, "Cannot allocate memory in test_mulredc\n"); exit (1); } mpn_random2(m, N); m[0] |= 1UL; if (m[N-1] == 0) m[N-1] = 1UL; invm = 1UL; for (i = 0; i < 10; ++i) invm = (2*invm-m[0]*invm*invm); invm = -invm; assert( (invm*m[0] +1UL) == 0UL); yp = y; for (i=0; i < k; ++i) { /* Try a few special cases */ if (i == 0) { /* Try all 0, product should be 0 */ for (j = 0; j < N; j++) x[j] = y[j] = 0; } else if (i == 1) { /* Try all 1 */ for (j = 0; j < N; j++) x[j] = y[j] = 1; } else if (i == 2) { /* Try all 2^wordsize - 1 */ for (j = 0; j < 
N; j++) x[j] = y[j] = ~(0UL); } else { /* In the other cases, try random data */ if (i % 2 == 0) { /* Try squaring */ mpn_random2(x, N); yp = x; } else { /* Try multiplication */ mpn_random2(x, N); mpn_random2(y, N); } } /* Mixed mul and redc */ cy = call_mulredc (N, z, x, yp, m, invm); if (cy) printf("!"); z[N] = cy; /* Check with pure gmp : multiply by 2^(N*GMP_NUMB_BITS) and compare. */ for (j=0; j < N; ++j) { tmp[j] = 0; tmp[j+N] = z[j]; } tmp[2*N] = z[N]; mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N+1, m, N); for (j=0; j < N; ++j) z[j] = tmp3[j]; mpn_mul_n(tmp, x, yp, N); mpn_tdiv_qr(tmp2, tmp3, 0, tmp, 2*N, m, N); assert(mpn_cmp(z, tmp3, N) == 0); #if defined(HAVE_NATIVE_MULREDC1_N) /* Test mulredc1_n() */ z[N] = call_mulredc1 (N, z, x[0], yp, m, invm); tmp[0] = 0; for (j=0; j <= N; ++j) /* Multiply by 2^GMP_NUMB_BITS */ tmp[j+1] = z[j]; mpn_tdiv_qr(tmp2, tmp3, 0, tmp, N+2, m, N); for (j=0; j < N; ++j) z[j] = tmp3[j]; tmp[N] = mpn_mul_1 (tmp, yp, N, x[0]); mpn_tdiv_qr(tmp2, tmp3, 0, tmp, N+1, m, N); assert(mpn_cmp(z, tmp3, N) == 0); #endif } free(tmp); free(tmp2); free(tmp3); free(x); free(y); free(z); free(m); } int main(int argc, char** argv) { int i, len; if (argc > 1) /* Test a specific length */ { len = atoi (argv[1]); for (i = 0; i < 1; i++) test (len, 1000000); return 0; } for (;;) { for (i = 1; i <= 20; ++i) { test(i, 1000); } #if 0 test(1, 1000); test(2, 1000); test(3, 1000); test(4, 1000); test(5, 1000); test(6, 1000); test(7, 1000); test(8, 1000); test(9, 1000); test(10, 1000); test(11, 1000); test(12, 1000); test(13, 100); test(14, 100); test(15, 100); test(16, 100); test(17, 100); test(18, 100); test(44, 10); test(45, 10); test(46, 10); test(47, 10); test(48, 10); test(49, 10); #endif printf("."); fflush(stdout); } #if 0 x[0] = 12580274668139321508UL; x[1] = 9205793975152560417UL; x[2] = 7857372727033793057UL; y[0] = 13688385828267279103UL; y[1] = 10575011835742767258UL; y[2] = 8802048318027595690UL; m[0] = 2981542467342508025UL; m[1] = 
5964669706257742025UL; m[2] = 18446744073678090270UL; invm = 9419286575570128311UL; carry = mulredc(z, x, y, m, 3, invm); printf("%lu + 2^64*(%lu + 2^64*%lu), carry=%lu\n", z[0], z[1], z[2], carry); #endif return 0; } #if 0 W := 2^64; x0:= 12580274668139321508; x1:= 9205793975152560417; x2:= 7857372727033793057; x := x0 + W*(x1 + W*x2); y0:= 13688385828267279103; y1:= 10575011835742767258; y2:= 8802048318027595690; y := y0 + W*(y1 + W*y2); m0:= 2981542467342508025; m1:= 5964669706257742025; m2:= 18446744073678090270; m := m0 + W*(m1 + W*m2); invm := 9419286575570128311; #endif ecm-6.4.4/pm1.c0000644023561000001540000010422212106741274010061 00000000000000/* Pollard 'P-1' algorithm. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Paul Zimmermann and Alexander Kruppa. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include #include "ecm-impl.h" #define CASCADE_THRES 3 #define CASCADE_MAX 50000000.0 #ifndef POWM_THRESHOLD #define POWM_THRESHOLD 100 #endif typedef struct { unsigned int size; mpz_t *val; } mul_casc; /****************************************************************************** * * * Stage 1 * * * ******************************************************************************/ /* prime powers are accumulated up to about n^L1 */ #define L1 16 /*** Cascaded multiply ***/ /* return NULL if an error occurred */ static mul_casc * mulcascade_init (void) { mul_casc *t; t = (mul_casc *) malloc (sizeof (mul_casc)); if (t == NULL) { outputf (OUTPUT_ERROR, "mulcascade_init: could not allocate memory\n"); return NULL; } t->val = (mpz_t*) malloc (sizeof (mpz_t)); if (t->val == NULL) { outputf (OUTPUT_ERROR, "mulcascade_init: could not allocate memory\n"); free (t); return NULL; } mpz_init (t->val[0]); t->size = 1; return t; } static void mulcascade_free (mul_casc *c) { unsigned int i; for (i = 0; i < c->size; i++) mpz_clear (c->val[i]); free (c->val); free (c); } static mul_casc * mulcascade_mul_d (mul_casc *c, const double n, ATTRIBUTE_UNUSED mpz_t t) { unsigned int i; if (mpz_sgn (c->val[0]) == 0) { mpz_set_d (c->val[0], n); return c; } mpz_mul_d (c->val[0], c->val[0], n, t); if (mpz_size (c->val[0]) <= CASCADE_THRES) return c; for (i = 1; i < c->size; i++) { if (mpz_sgn (c->val[i]) == 0) { mpz_set (c->val[i], c->val[i-1]); mpz_set_ui (c->val[i-1], 0); return c; } else { mpz_mul (c->val[i], c->val[i], c->val[i-1]); mpz_set_ui (c->val[i-1], 0); } } /* Allocate more space for cascade */ i = c->size++; c->val = (mpz_t*) realloc (c->val, c->size * sizeof (mpz_t)); if (c->val == NULL) { fprintf (stderr, "Cannot allocate memory in mulcascade_mul_d\n"); exit (1); } mpz_init (c->val[i]); mpz_swap (c->val[i], c->val[i-1]); return c; } static mul_casc * mulcascade_mul (mul_casc *c, mpz_t n) { unsigned int i; if (mpz_sgn (c->val[0]) == 0) { mpz_set (c->val[0], n); return 
c; } mpz_mul (c->val[0], c->val[0], n); if (mpz_size (c->val[0]) <= CASCADE_THRES) return c; for (i = 1; i < c->size; i++) { if (mpz_sgn (c->val[i]) == 0) { mpz_set (c->val[i], c->val[i-1]); mpz_set_ui (c->val[i-1], 0); return c; } else { mpz_mul (c->val[i], c->val[i], c->val[i-1]); mpz_set_ui (c->val[i-1], 0); } } /* Allocate more space for cascade */ i = c->size++; c->val = (mpz_t*) realloc (c->val, c->size * sizeof (mpz_t)); if (c->val == NULL) { fprintf (stderr, "Cannot allocate memory in mulcascade_mul\n"); exit (1); } mpz_init (c->val[i]); mpz_swap (c->val[i], c->val[i-1]); return c; } static void mulcascade_get_z (mpz_t r, mul_casc *c) { unsigned int i; if (c->size == 0) { mpz_set_ui (r, 1); /* Empty product */ return; } mpz_set_ui (r, 1); for (i = 0; i < c->size; i++) if (mpz_sgn (c->val[i]) != 0) mpz_mul (r, r, c->val[i]); } /* Input: a is the generator (sigma) n is the number to factor B1 is the stage 1 bound B1done: stage 1 was already done up to that limit go is the group order to preload Output: f is the factor found, a is the value at end of stage 1 B1done is set to B1 if stage 1 completed normally, or to the largest prime processed if interrupted, but never to a smaller value than B1done was upon function entry. Return value: non-zero iff a factor was found (or an error occurred). 
*/

/* Outline: small primes (up to cascade_limit) are multiplied together in a
   mul_casc and applied with one exponentiation; the remaining primes up to
   B1 are batched into g and applied whenever g reaches max_size bits.
   stop_asap, when non-NULL, is polled after each such batch; chkfilename,
   when non-NULL, receives checkpoints during the last loop and on exit. */
static int
pm1_stage1 (mpz_t f, mpres_t a, mpmod_t n, double B1, double *B1done,
            mpz_t go, int (*stop_asap)(void), char *chkfilename)
{
  double p, q, r, cascade_limit, last_chkpnt_p;
  mpz_t g, d;
  int youpi = ECM_NO_FACTOR_FOUND;
  unsigned int size_n, max_size;
  unsigned int smallbase = 0;
  mul_casc *cascade;
  long last_chkpnt_time;
  const double B0 = sqrt (B1);

  mpz_init (g);
  mpz_init (d);

  size_n = mpz_sizeinbase (n->orig_modulus, 2);
  max_size = L1 * size_n;

  mpres_get_z (g, a, n);
  if (mpz_fits_uint_p (g))
    smallbase = mpz_get_ui (g);

  /* suggestion from Peter Montgomery: start with exponent n-1,
     since any prime divisor of b^m-1 which does not divide any
     algebraic factor of b^m-1 must be of the form km+1 [Williams82].
     Do this only when n is composite, otherwise all tests with prime
     n factor of a Cunningham number will succeed in stage 1.

     Since mpz_probab_prime_p and a^(n-1) mod n require about lg(n) modular
     multiplications, and P-1 perform about B1 modular multiplications,
     to ensure small overhead, use that trick only when lg(n) <= sqrt(B1).
  */
  /* For now, this p^N-1 is left in.  We might want it out at a later time */
  /* NOTE(review): smallbase was read from a before this exponentiation. If
     smallbase != 0, the mpres_ui_pow call below appears to restart from
     smallbase and overwrite a, which would drop the n-1 exponent applied
     here -- confirm whether that is intended. */
  if ((double) size_n <= B0 &&
      mpz_probab_prime_p (n->orig_modulus, PROBAB_PRIME_TESTS) == 0)
    {
      mpz_sub_ui (g, n->orig_modulus, 1);
      mpres_pow (a, a, g, n);
    }
  else
    mpz_set_ui (g, 1);

  /* Set a limit of roughly 10000 * log_10(N) for the primes that are
     multiplied up in the exponent, i.e. 1M for a 100 digit number,
     but limit to CASCADE_MAX to avoid problems with stack allocation */

  cascade_limit = 3000.0 * (double) size_n;

  if (cascade_limit > CASCADE_MAX)
    cascade_limit = CASCADE_MAX;

  if (cascade_limit > B1)
    cascade_limit = B1;

  cascade = mulcascade_init ();
  if (cascade == NULL)
    {
      youpi = ECM_ERROR;
      goto clear_pm1_stage1;
    }

  /* since B0 = sqrt(B1), we can have B0 > cascade_limit only when
     B1 > cascade_limit^2. This cannot happen when cascade_limit=B1,
     thus we need B1 > min(CASCADE_MAX, 3000*sizeinbase(n,2))^2.
     For sizeinbase(n,2) <= CASCADE_MAX/3000 (less than 5017 digits
     for CASCADE_MAX=5e7) this means B1 > 9e6*sizeinbase(n,2)^2.
     For sizeinbase(n,2) > CASCADE_MAX/3000, this means B1 > CASCADE_MAX^2,
     i.e. B1 > 25e14 for CASCADE_MAX=5e7.
  */

  /* if the user knows that P-1 has a given divisor, he can supply it */
  if (mpz_cmp_ui (go, 1) > 0)
    cascade = mulcascade_mul (cascade, go);

  last_chkpnt_time = cputime ();
  last_chkpnt_p = 2.;

  /* Fill the multiplication cascade with the product of small stage 1
     primes */
  /* Add small primes <= MIN(sqrt(B1), cascade_limit) in the appropriate
     power to the cascade */
  for (p = 2.; p <= MIN(B0, cascade_limit); p = getprime ())
    {
      for (q = 1., r = p; r <= B1; r *= p)
        if (r > *B1done) q *= p;
      cascade = mulcascade_mul_d (cascade, q, d);
    }

  /* If B0 < cascade_limit, we can add some primes > sqrt(B1) with
     exponent 1 to the cascade */
  for ( ; p <= cascade_limit; p = getprime ())
    if (p > *B1done)
      cascade = mulcascade_mul_d (cascade, p, d);

  /* Now p > cascade_limit, flush cascade and exponentiate */
  mulcascade_get_z (g, cascade);
  mulcascade_free (cascade);
  /* mpz_sizeinbase returns size_t; cast so the argument matches %u */
  outputf (OUTPUT_DEVVERBOSE, "Exponent has %u bits\n",
           (unsigned int) mpz_sizeinbase (g, 2));

  if (smallbase)
    {
      outputf (OUTPUT_DEVVERBOSE, "Using mpres_ui_pow, base %u\n", smallbase);
      mpres_ui_pow (a, smallbase, g, n);
    }
  else
    {
      mpres_pow (a, a, g, n);
    }
  mpz_set_ui (g, 1);

  /* If B0 > cascade_limit, we need to process the primes
     cascade_limit < p < B0 in the appropriate exponent yet */
  for ( ; p <= B0; p = getprime ())
    {
      for (q = 1, r = p; r <= B1; r *= p)
        if (r > *B1done) q *= p;
      mpz_mul_d (g, g, q, d);
      if (mpz_sizeinbase (g, 2) >= max_size)
        {
          mpres_pow (a, a, g, n);
          mpz_set_ui (g, 1);
          if (stop_asap != NULL && (*stop_asap) ())
            {
              outputf (OUTPUT_NORMAL, "Interrupted at prime %.0f\n", p);
              if (p > *B1done)
                *B1done = p;
              goto clear_pm1_stage1;
            }
        }
    }

  /* All primes sqrt(B1) < p <= B1 appear in exponent 1. All primes <= B1done
     are already included in exponent of at least 1, so it's safe to skip
     ahead to B1done+1 */

  if (*B1done > p)
    {
      getprime_seek ((*B1done) + 1.);
      p = getprime ();
    }

  /* then remaining primes > max(sqrt(B1), cascade_limit) and taken
     with exponent 1 */
  for (; p <= B1; p = getprime ())
    {
      mpz_mul_d (g, g, p, d);
      if (mpz_sizeinbase (g, 2) >= max_size)
        {
          mpres_pow (a, a, g, n);
          mpz_set_ui (g, 1);
          if (stop_asap != NULL && (*stop_asap) ())
            {
              outputf (OUTPUT_NORMAL, "Interrupted at prime %.0f\n", p);
              if (p > *B1done)
                *B1done = p;
              goto clear_pm1_stage1;
            }
          if (chkfilename != NULL && p > last_chkpnt_p + 10000. &&
              elltime (last_chkpnt_time, cputime ()) > CHKPNT_PERIOD)
            {
              writechkfile (chkfilename, ECM_PM1, p, n, NULL, a, NULL);
              last_chkpnt_p = p;
              last_chkpnt_time = cputime ();
            }
        }
    }

  mpres_pow (a, a, g, n);

  /* If stage 1 finished normally, p is the smallest prime >B1 here.
     In that case, set to B1 */
  if (p > B1)
    p = B1;

  if (p > *B1done)
    *B1done = p;

  /* The factor candidate is gcd(a - 1, n); a itself is restored afterwards */
  mpres_sub_ui (a, a, 1, n);
  mpres_gcd (f, a, n);
  if (mpz_cmp_ui (f, 1) > 0)
    youpi = ECM_FACTOR_FOUND_STEP1;
  mpres_add_ui (a, a, 1, n);

clear_pm1_stage1:
  if (chkfilename != NULL)
    writechkfile (chkfilename, ECM_PM1, *B1done, n, NULL, a, NULL);
  getprime_clear (); /* free the prime tables, and reinitialize */
  mpz_clear (d);
  mpz_clear (g);

  return youpi;
}

/******************************************************************************
*                                                                             *
*                                   Stage 2                                   *
*                                                                             *
******************************************************************************/

/* For each of the nr progressions each of S+1 entries in fd[],
   performs the update fd[k] *= fd[k+1], 0 <= k < S+1.
*/ static void update_fd (mpres_t *fd, unsigned int nr, unsigned int S, mpmod_t modulus, unsigned long *muls) { unsigned int j, k; for (j = 0; j < nr * (S + 1); j += S + 1) for (k = 0; k < S; k++) mpres_mul (fd[j + k], fd[j + k], fd[j + k + 1], modulus); if (muls != NULL) *muls += (unsigned long) nr * S; } /* Puts in F[0..dF-1] the successive values of x^(Dickson_{S, a}(j * d2)) for j == 1 mod 6 , j and d1 coprime, where Dickson_{S, a} is the degree S Dickson polynomial with parameter a. For a == 0, Dickson_{S, a} (x) = x^S. Uses the x+1/x trick whenever S > 6 and even, then the Dickson parameter a must be 0. Requires (dF+1) cells in t for the x+1/x trick. Returns non-zero iff a factor was found (then stored in f), or an error occurred. */ int pm1_rootsF (mpz_t f, listz_t F, root_params_t *root_params, unsigned long dF, mpres_t *x, listz_t t, mpmod_t modulus) { unsigned long i; unsigned long muls = 0, gcds = 0; long st, st1; pm1_roots_state_t state; progression_params_t *params = &state.params; /* for less typing */ listz_t coeffs; mpz_t ts; if (dF == 0) return 0; st = cputime (); /* Relative cost of point add during init and computing roots assumed =1 */ init_roots_params (&state.params, root_params->S, root_params->d1, root_params->d2, 1.0); /* The invtrick is profitable for x^S, S even and > 6. Does not work for Dickson polynomials (root_params->S < 0)! 
*/ if (root_params->S > 6 && (root_params->S & 1) == 0) { state.invtrick = 1; params->S /= 2; params->size_fd = params->nr * (params->S + 1); } else state.invtrick = 0; outputf (OUTPUT_DEVVERBOSE, "pm1_rootsF: state: nr = %d, dsieve = %d, size_fd = %d, S = %d, " "dickson_a = %d, invtrick = %d\n", params->nr, params->dsieve, params->size_fd, params->S, params->dickson_a, state.invtrick); /* Init finite differences tables */ mpz_init (ts); /* ts = 0 */ coeffs = init_progression_coeffs (ts, params->dsieve, root_params->d2, 1, 6, params->S, params->dickson_a); mpz_clear (ts); if (coeffs == NULL) return ECM_ERROR; /* Allocate memory for fd[] and compute x^coeff[]*/ state.fd = (mpres_t *) malloc (params->size_fd * sizeof (mpres_t)); if (state.fd == NULL) { clear_list (coeffs, params->size_fd); return ECM_ERROR; } for (i = 0; i < params->size_fd; i++) { outputf (OUTPUT_TRACE, "pm1_rootsF: coeffs[%d] = %Zd\n", i, coeffs[i]); mpres_init (state.fd[i], modulus); /* The highest coefficient of all progressions is identical */ if (i > params->S + 1 && i % (params->S + 1) == params->S) { ASSERT (mpz_cmp (coeffs[i], coeffs[params->S]) == 0); mpres_set (state.fd[i], state.fd[params->S], modulus); } else mpres_pow (state.fd[i], *x, coeffs[i], modulus); } clear_list (coeffs, params->size_fd); coeffs = NULL; st1 = cputime (); outputf (OUTPUT_VERBOSE, "Initializing table of differences for F took %ldms\n", elltime (st, st1)); st = st1; /* Now for the actual calculation of the roots. */ for (i = 0; i < dF;) { /* Is this a rsieve value where we computed x^Dickson(j * d2) ? */ if (gcd (params->rsieve, params->dsieve) == 1) { /* Did we use every progression since the last update? */ if (params->next == params->nr) { /* Yes, time to update again */ update_fd (state.fd, params->nr, params->S, modulus, &muls); params->next = 0; } /* Is this a j value where we want x^Dickson(j * d2) as a root? 
*/ if (gcd (params->rsieve, root_params->d1) == 1) mpres_get_z (F[i++], state.fd[params->next * (params->S + 1)], modulus); params->next ++; } params->rsieve += 6; } for (i = 0; i < params->size_fd; i++) mpres_clear (state.fd[i], modulus); free (state.fd); state.fd = NULL; if (state.invtrick) { if (list_invert (t, F, dF, t[dF], modulus)) { /* Should never happen */ outputf (OUTPUT_ERROR, "Found factor unexpectedly while inverting F[0]*..*F[dF]\n"); mpz_set (f, t[dF]); return ECM_FACTOR_FOUND_STEP2; } muls += 3 * (dF - 1); gcds ++; for (i = 0; i < dF; i++) { mpz_add (F[i], F[i], t[i]); mpz_mod (F[i], F[i], modulus->orig_modulus); } } outputf (OUTPUT_VERBOSE, "Computing roots of F took %ldms", elltime (st, cputime ())); outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds", muls, gcds); outputf (OUTPUT_VERBOSE, "\n"); return ECM_NO_FACTOR_FOUND; } /* Perform the necessary initialisation to allow computation of x^(Dickson_{S, a}(s+n*d)) for successive n, where Dickson_{S, a} is the degree S Dickson polynomial with parameter a. For a == 0, Dickson_{S, a} (x) = x^S. Uses the x+1/x trick whenever S > 6 and even. Return NULL if an error occurred. */ pm1_roots_state_t * pm1_rootsG_init (mpres_t *x, root_params_t *root_params, mpmod_t modulus) { unsigned int i; listz_t coeffs; pm1_roots_state_t *state; progression_params_t *params; /* for less typing */ state = (pm1_roots_state_t *) malloc (sizeof (pm1_roots_state_t)); if (state == NULL) return NULL; params = &(state->params); params->dickson_a = (root_params->S < 0) ? -1 : 0; params->nr = (root_params->d2 > 1) ? root_params->d2 - 1 : 1; params->next = 0; state->invtrick = (root_params->S > 6 && (root_params->S & 1) == 0); params->S = (state->invtrick) ? 
abs (root_params->S) / 2 : abs (root_params->S); params->size_fd = params->nr * (params->S + 1); params->dsieve = 1; params->rsieve = 1; outputf (OUTPUT_DEVVERBOSE, "pm1_rootsG_init: d1 = %lu, d2 = %lu, state: dsieve = %d, " "nr = %d, size_fd = %d, S = %d, invtrick = %d\n", root_params->d1, root_params->d2, params->dsieve, params->nr, params->size_fd, params->S, state->invtrick); state->fd = (mpres_t *) malloc (params->size_fd * sizeof (mpres_t)); if (state->fd == NULL) { free (state); return NULL; } /* Init for Dickson_{E,a} (i0 * d + d1 * n) */ coeffs = init_progression_coeffs (root_params->i0, root_params->d2, root_params->d1, 1, 1, params->S, params->dickson_a); if (coeffs == NULL) { free (state->fd); free (state); return NULL; } for (i = 0; i < params->size_fd; i++) { outputf (OUTPUT_TRACE, "pm1_rootsG_init: coeffs[%d] = %Zd\n", i, coeffs[i]); mpres_init (state->fd[i], modulus); /* The S-th coeff of all progressions is identical */ if (i > params->S && i % (params->S + 1) == params->S) { ASSERT (mpz_cmp (coeffs[i], coeffs[params->S]) == 0); /* Simply copy from the first progression */ mpres_set (state->fd[i], state->fd[params->S], modulus); } else { if (mpz_sgn (coeffs[i]) < 0) { mpz_neg (coeffs[i], coeffs[i]); mpres_pow (state->fd[i], *x, coeffs[i], modulus); mpres_invert (state->fd[i], state->fd[i], modulus); mpz_neg (coeffs[i], coeffs[i]); } else { mpres_pow (state->fd[i], *x, coeffs[i], modulus); } } } clear_list (coeffs, params->size_fd); return state; } /* Frees all the dynamic variables allocated by pm1_rootsG_init() */ void pm1_rootsG_clear (pm1_roots_state_t *state, ATTRIBUTE_UNUSED mpmod_t modulus) { unsigned int k; for (k = 0; k < state->params.size_fd; k++) mpres_clear (state->fd[k], modulus); free (state->fd); state->fd = NULL; free (state); } /* Puts in G the successive values of x^(Dickson_{S, a}(s+j*k)) for 1 <= j <= d, where k is the 'd' value from pm1_rootsG_init() and s is the 's' value of pm1_rootsG_init() or where a previous call to 
pm1_rootsG has left off. Requires (d+1) cells in t for the x+1/x trick. Returns non-zero iff a factor was found (then stored in f). No error can occur. */ int pm1_rootsG (mpz_t f, listz_t G, unsigned long dF, pm1_roots_state_t *state, listz_t t, mpmod_t modulus) { unsigned long i; unsigned long muls = 0, gcds = 0; unsigned int st; progression_params_t *params = &(state->params); /* for less typing */ outputf (OUTPUT_TRACE, "pm1_rootsG: dF = %d, state: size_fd = %d, nr = %d, S = %d\n", dF, params->size_fd, params->nr, params->S); st = cputime (); for (i = 0; i < dF;) { /* Did we use every progression since the last update? */ if (params->next == params->nr) { /* Yes, time to update again */ outputf (OUTPUT_TRACE, "pm1_rootsG: Updating table at rsieve = %d\n", params->rsieve); update_fd (state->fd, params->nr, params->S, modulus, &muls); params->next = 0; } /* Is this a root we should skip? (Take only if gcd == 1) */ if (gcd (params->rsieve, params->dsieve) == 1) { outputf (OUTPUT_TRACE, "pm1_rootsG: Taking root G[%d] at rsieve = %d\n", i, params->rsieve); mpres_get_z (G[i++], state->fd[params->next * (params->S + 1)], modulus); } else outputf (OUTPUT_TRACE, "pm1_rootsG: Skipping root at rsieve = %d\n", params->rsieve); params->next ++; params->rsieve ++; } if (state->invtrick) { if (list_invert (t, G, dF, t[dF], modulus)) { outputf (OUTPUT_VERBOSE, "Found factor while inverting G[0]*..*G[d]\n"); mpz_set (f, t[dF]); return ECM_FACTOR_FOUND_STEP2; } muls += 3 * (dF - 1); gcds ++; for (i = 0; i < dF; i++) { mpz_add (G[i], G[i], t[i]); mpz_mod (G[i], G[i], modulus->orig_modulus); } } outputf (OUTPUT_VERBOSE, "Computing roots of G took %ldms", elltime (st, cputime ())); outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds", muls, gcds); outputf (OUTPUT_VERBOSE, "\n"); return ECM_NO_FACTOR_FOUND; } static void print_prob (double B1, const mpz_t B2, unsigned long dF, unsigned long k, int S, const mpz_t go) { double prob; int i; char sep; outputf (OUTPUT_VERBOSE, 
"Probability of finding a factor of n digits:\n"); if (go != NULL && mpz_cmp_ui (go, 1UL) <= 0) outputf (OUTPUT_VERBOSE, "(Use -go parameter to specify known factors in P-1)\n"); outputf (OUTPUT_VERBOSE, "20\t25\t30\t35\t40\t45\t50\t55\t60\t65\n"); for (i = 20; i <= 65; i += 5) { sep = (i < 65) ? '\t' : '\n'; prob = pm1prob (B1, mpz_get_d (B2), pow (10., i - .5), (double) dF * dF * k, S, go); outputf (OUTPUT_VERBOSE, "%.2g%c", prob, sep); } } /****************************************************************************** * * * Pollard P-1 * * * ******************************************************************************/ /* Input: p is the initial generator (sigma), if 0, generate it at random. N is the number to factor B1 is the stage 1 bound B2 is the stage 2 bound B1done is the stage 1 limit to which supplied residue has already been computed k is the number of blocks for stage 2 verbose is the verbosity level Output: f is the factor found, p is the residue at end of stage 1 Return value: non-zero iff a factor is found (1 for stage 1, 2 for stage 2) */ int pm1 (mpz_t f, mpz_t p, mpz_t N, mpz_t go, double *B1done, double B1, mpz_t B2min_parm, mpz_t B2_parm, double B2scale, unsigned long k, const int S, int verbose, int repr, int use_ntt, FILE *os, FILE *es, char *chkfilename, char *TreeFilename, double maxmem, gmp_randstate_t rng, int (*stop_asap)(void)) { int youpi = ECM_NO_FACTOR_FOUND; int base2 = 0; int Nbits, smallbase; int po2 = 0; /* Whether we should use power-of-2 poly degree */ long st; mpmod_t modulus; mpres_t x; mpz_t B2min, B2; /* Local B2, B2min to avoid changing caller's values */ unsigned long dF; root_params_t root_params; faststage2_param_t faststage2_params; /* If stage2_variant != 0, we use the new fast stage 2 */ const int stage2_variant = (S == 1 || S == ECM_DEFAULT_S); set_verbose (verbose); ECM_STDOUT = (os == NULL) ? stdout : os; ECM_STDERR = (es == NULL) ? 
stdout : es; /* if n is even, return 2 */ if (mpz_divisible_2exp_p (N, 1)) { mpz_set_ui (f, 2); return ECM_FACTOR_FOUND_STEP1; } st = cputime (); if (mpz_cmp_ui (p, 0) == 0) pm1_random_seed (p, N, rng); mpz_init_set (B2min, B2min_parm); mpz_init_set (B2, B2_parm); /* Set default B2. See ecm.c for comments */ if (ECM_IS_DEFAULT_B2(B2)) { if (stage2_variant == 0) mpz_set_d (B2, B2scale * pow (B1 * PM1_COST, DEFAULT_B2_EXPONENT)); else mpz_set_d (B2, B2scale * pow (B1 * PM1FS2_COST, PM1FS2_DEFAULT_B2_EXPONENT)); } /* set B2min */ if (mpz_sgn (B2min) < 0) mpz_set_d (B2min, B1); if (repr != ECM_MOD_DEFAULT && repr != ECM_MOD_NOBASE2) { if (repr == ECM_MOD_MODMULN) mpmod_init_MODMULN (modulus, N); else if (repr == ECM_MOD_REDC) mpmod_init_REDC (modulus, N); else if (abs (repr) > 16) { if (mpmod_init_BASE2 (modulus, repr, N) == ECM_ERROR) return ECM_ERROR; } else mpmod_init_MPZ (modulus, N); } else /* automatic choice */ { /* Find a good arithmetic for this number */ Nbits = mpz_sizeinbase (N, 2); base2 = (repr == 0) ? isbase2 (N, BASE2_THRESHOLD) : 0; smallbase = mpz_fits_uint_p (p); /* TODO: make dependent on Nbits and base2 */ if (base2) { mpmod_init_BASE2 (modulus, base2, N); } else if (mpz_size (N) <= 2 * POWM_THRESHOLD && smallbase && B1 <= 1e6) /* Below POWM_THRESHOLD, mpz_powm uses MODMULN reduction, too, but without special code for small bases which makes our MODMULN faster. Above POWM_THRESHOLD mpz_powm uses faster mod reduction, at about 2*POWM_THRESHOLD it catches up with our smallbase-MODMULN and then is faster until REDC takes over. */ { outputf (OUTPUT_VERBOSE, "Using MODMULN\n"); mpmod_init_MODMULN (modulus, N); } else if (Nbits > 50000 || (Nbits > 3500 && smallbase)) { outputf (OUTPUT_VERBOSE, "Using REDC\n"); mpmod_init_REDC (modulus, N); } else { outputf (OUTPUT_VERBOSE, "Using mpz_powm\n"); mpmod_init_MPZ (modulus, N); } } /* Determine parameters (polynomial degree etc.) 
*/ if (stage2_variant != 0) { long P_ntt, P_nontt; const unsigned long lmax = 1UL<<28; /* An upper bound */ unsigned long lmax_NTT, lmax_noNTT; faststage2_param_t params_ntt, params_nontt, *better_params; mpz_init (faststage2_params.m_1); faststage2_params.l = 0; mpz_init (params_ntt.m_1); params_ntt.l = 0; mpz_init (params_nontt.m_1); params_nontt.l = 0; /* Find out what the longest transform length is we can do at all. If no maxmem is given, the non-NTT can theoretically do any length. */ lmax_NTT = 0; if (use_ntt) { unsigned long t; /* See what transform length the NTT can handle (due to limited primes and limited memory) */ t = mpzspm_max_len (N); lmax_NTT = MIN (lmax, t); if (maxmem != 0.) { t = pm1fs2_maxlen (double_to_size (maxmem), N, use_ntt); lmax_NTT = MIN (lmax_NTT, t); } outputf (OUTPUT_DEVVERBOSE, "NTT can handle lmax <= %lu\n", lmax_NTT); /* FIXME: if both ntt and no-ntt are tried, but finally ntt is preferred, the last B2 bound computed is that of no-ntt, which is thus wrong */ P_ntt = choose_P (B2min, B2, lmax_NTT, k, ¶ms_ntt, B2min, B2, 1, ECM_PM1); if (P_ntt != ECM_ERROR) outputf (OUTPUT_DEVVERBOSE, "Parameters for NTT: P=%lu, l=%lu\n", params_ntt.P, params_ntt.l); } else P_ntt = 0; /* or GCC complains about uninitialized var */ /* See what transform length the non-NTT code can handle */ lmax_noNTT = lmax; if (maxmem != 0.) 
{ unsigned long t; t = pm1fs2_maxlen (double_to_size (maxmem), N, 0); lmax_noNTT = MIN (lmax_noNTT, t); outputf (OUTPUT_DEVVERBOSE, "non-NTT can handle lmax <= %lu\n", lmax_noNTT); } if (use_ntt != 2) P_nontt = choose_P (B2min, B2, lmax_noNTT, k, ¶ms_nontt, B2min, B2, 0, ECM_PM1); else P_nontt = ECM_ERROR; if (P_nontt != ECM_ERROR) outputf (OUTPUT_DEVVERBOSE, "Parameters for non-NTT: P=%lu, l=%lu\n", params_nontt.P, params_nontt.l); if (((!use_ntt || P_ntt == ECM_ERROR) && P_nontt == ECM_ERROR) || (use_ntt == 2 && P_ntt == ECM_ERROR)) { outputf (OUTPUT_ERROR, "Error: cannot choose suitable P value for your stage 2 " "parameters.\nTry a shorter B2min,B2 interval.\n"); mpz_clear (faststage2_params.m_1); mpz_clear (params_ntt.m_1); mpz_clear (params_nontt.m_1); return ECM_ERROR; } /* Now decide wether to take NTT or non-NTT. How to choose the better one is not an easy question. It will depend on the speed ratio between NTT/non-NTT code, their difference in memory use and available memory. For now, we choose the one that uses a longer transform length. FIXME: Write something not brain-dead here */ if (use_ntt == 0 || P_ntt == ECM_ERROR || (use_ntt == 1 && params_nontt.l > params_ntt.l)) { better_params = ¶ms_nontt; use_ntt = 0; } else { better_params = ¶ms_ntt; use_ntt = 1; } faststage2_params.P = better_params->P; faststage2_params.s_1 = better_params->s_1; faststage2_params.s_2 = better_params->s_2; faststage2_params.l = better_params->l; mpz_set (faststage2_params.m_1, better_params->m_1); mpz_clear (params_ntt.m_1); mpz_clear (params_nontt.m_1); if (maxmem != 0.) outputf (OUTPUT_VERBOSE, "Using lmax = %lu with%s NTT which takes " "about %luMB of memory\n", faststage2_params.l, (use_ntt) ? 
"" : "out", pm1fs2_memory_use (faststage2_params.l, N, use_ntt)/1048576); } else { mpz_init (root_params.i0); root_params.d2 = 0; /* Enable automatic choice of d2 */ if (use_ntt || (modulus->repr == ECM_MOD_BASE2 && modulus->Fermat > 0)) po2 = 1; if (bestD (&root_params, &k, &dF, B2min, B2, po2, use_ntt, maxmem, (TreeFilename != NULL), modulus) == ECM_ERROR) { youpi = ECM_ERROR; goto clear_and_exit; } root_params.S = S; /* Set default degree for Brent-Suyama extension */ if (root_params.S == ECM_DEFAULT_S) { if (modulus->repr == ECM_MOD_BASE2 && modulus->Fermat > 0) { /* For Fermat numbers, default is 2 (no Brent-Suyama) */ root_params.S = 2; } else { mpz_t t; mpz_init (t); mpz_sub (t, B2, B2min); if (mpz_cmp_d (t, 3.5e5) < 0) /* B1 < 50000 */ root_params.S = -4; /* Dickson polys give a slightly better chance of success */ else if (mpz_cmp_d (t, 1.1e7) < 0) /* B1 < 500000 */ root_params.S = -6; else if (mpz_cmp_d (t, 1.25e8) < 0) /* B1 < 3000000 */ root_params.S = 12; /* but for S>6, S-th powers are faster thanks to invtrick */ else if (mpz_cmp_d (t, 7.e9) < 0) /* B1 < 50000000 */ root_params.S = 24; else if (mpz_cmp_d (t, 1.9e10) < 0) /* B1 < 100000000 */ root_params.S = 48; else if (mpz_cmp_d (t, 5.e11) < 0) /* B1 < 1000000000 */ root_params.S = 60; else root_params.S = 120; mpz_clear (t); } } /* We need Suyama's power even and at least 2 for P-1 stage 2 to work correctly */ if (root_params.S & 1) root_params.S *= 2; /* FIXME: Is this what the user would expect? */ } /* Print B1, B2, polynomial and x0 */ print_B1_B2_poly (OUTPUT_NORMAL, ECM_PM1, B1, *B1done, B2min_parm, B2min, B2, (stage2_variant == 0) ? 
root_params.S : 1, p, 0, NULL); /* If we do a stage 2, print its parameters */ if (mpz_cmp (B2, B2min) >= 0) { if (stage2_variant != 0) outputf (OUTPUT_VERBOSE, "P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu, " "m_1 = %Zd\n", faststage2_params.P, faststage2_params.l, faststage2_params.s_1,faststage2_params.s_2, faststage2_params.m_1); else outputf (OUTPUT_VERBOSE, "dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n", dF, k, root_params.d1, root_params.d2, root_params.i0); } if (test_verbose (OUTPUT_VERBOSE)) { if (mpz_sgn (B2min_parm) >= 0) { outputf (OUTPUT_VERBOSE, "Can't compute success probabilities for B1 <> B2min\n"); } else { rhoinit (256, 10); print_prob (B1, B2, dF, k, (stage2_variant == 0) ? root_params.S : 1, go); } } mpres_init (x, modulus); mpres_set_z (x, p, modulus); st = cputime (); if (B1 > *B1done) youpi = pm1_stage1 (f, x, modulus, B1, B1done, go, stop_asap, chkfilename); st = elltime (st, cputime ()); outputf (OUTPUT_NORMAL, "Step 1 took %ldms\n", st); if (test_verbose (OUTPUT_RESVERBOSE)) { mpz_t tx; mpz_init (tx); mpres_get_z (tx, x, modulus); outputf (OUTPUT_RESVERBOSE, "x=%Zd\n", tx); mpz_clear (tx); } if (stop_asap != NULL && (*stop_asap) ()) goto clear_and_exit; if (youpi == ECM_NO_FACTOR_FOUND && mpz_cmp (B2, B2min) >= 0) { if (stage2_variant != 0) { if (use_ntt) youpi = pm1fs2_ntt (f, x, modulus, &faststage2_params); else youpi = pm1fs2 (f, x, modulus, &faststage2_params); } else youpi = stage2 (f, &x, modulus, dF, k, &root_params, ECM_PM1, use_ntt, TreeFilename, stop_asap); } if (test_verbose (OUTPUT_VERBOSE)) { if (mpz_sgn (B2min_parm) < 0) rhoinit (1, 0); /* Free memory of rhotable */ } clear_and_exit: mpres_get_z (p, x, modulus); mpres_clear (x, modulus); mpmod_clear (modulus); if (stage2_variant != 0) mpz_clear (faststage2_params.m_1); else mpz_clear (root_params.i0); mpz_clear (B2); mpz_clear (B2min); return youpi; } ecm-6.4.4/mpmod.c0000644023561000001540000021523412110710163010472 00000000000000/* Modular multiplication. 
Copyright 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Paul Zimmermann, Alexander Kruppa and Cyril Bouvier. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "ecm-gmp.h" #include "ecm-impl.h" #include "mpmod.h" #ifdef USE_ASM_REDC #include "mulredc.h" #endif FILE *ECM_STDOUT, *ECM_STDERR; /* define them here since needed in tune.c */ /* define WANT_ASSERT to check normalization of residues */ /* #define WANT_ASSERT 1 */ /* #define DEBUG */ /* #define WANT_ASSERT_EXPENSIVE 1 */ #define ASSERT_NORMALIZED(x) ASSERT ((modulus->repr != ECM_MOD_MODMULN && \ modulus->repr != ECM_MOD_REDC) || \ mpz_size (x) <= mpz_size (modulus->orig_modulus)) #define MPZ_NORMALIZED(x) ASSERT (PTR(x)[ABSIZ(x)-1] != 0) static void ecm_redc_basecase (mpz_ptr, mpz_ptr, mpmod_t) ATTRIBUTE_HOT; static void ecm_mulredc_basecase (mpres_t, const mpres_t, const mpres_t, mpmod_t) ATTRIBUTE_HOT; static void base2mod (mpres_t, const mpres_t, mpres_t, mpmod_t) ATTRIBUTE_HOT; static void REDC (mpres_t, const mpres_t, mpz_t, mpmod_t); /* Up from GMP 5.1.0, mpn_redc{1,2} do not subtract the modulus if needed, but return the carry of the final addition */ #ifdef HAVE___GMPN_REDC_1 #ifdef MPN_REDC12_RETURNS_CARRY 
#define REDC1(rp,cp,np,nn,invm) \ do {if (__gmpn_redc_1 (rp,cp,np,nn,invm)) \ mpn_sub_n (rp, rp, np, nn); \ } while(0) #else #define REDC1(rp,cp,np,nn,invm) __gmpn_redc_1(rp,cp,np,nn,invm) #endif #endif #ifdef HAVE___GMPN_REDC_2 #ifdef MPN_REDC12_RETURNS_CARRY #define REDC2(rp,cp,np,nn,invm) \ do {if (__gmpn_redc_2 (rp,cp,np,nn,invm)) \ mpn_sub_n (rp, rp, np, nn); \ } while (0) #else #define REDC2(rp,cp,np,nn,invm) __gmpn_redc_2(rp,cp,np,nn,invm) #endif #endif #if 0 /* PZ: commented out, since I don't see how to use this code. Indeed, we need a large enough value of K to get significant timings; however, for small B1 a too large value of K will increase the total time for a curve. */ /* return non-zero if base-2 division if better for n, with K multiplications */ static int mpmod_tune_base2 (const mpz_t n, int K, int base2) { mpmod_t modulus; int k; long t0, t1; mpres_t x; /* try first without base-2 division */ mpmod_init (modulus, n, ECM_MOD_NOBASE2, 0); mpres_init (x, modulus); mpres_set_z (x, n, modulus); mpres_sub_ui (x, x, 1, modulus); /* so that the initial value is dense */ t0 = cputime (); for (k = 0; k < K; k++) mpres_sqr (x, x, modulus); t0 = cputime () - t0; mpres_clear (x, modulus); mpmod_clear (modulus); /* now with base-2 division */ mpmod_init (modulus, n, ECM_MOD_BASE2, base2); mpres_init (x, modulus); mpres_set_z (x, n, modulus); mpres_sub_ui (x, x, 1, modulus); /* so that the initial value is dense */ t1 = cputime (); for (k = 0; k < K; k++) mpres_sqr (x, x, modulus); t1 = cputime () - t1; fprintf (stderr, "ECM_MOD_NOBASE2:%ld ECM_MOD_BASE2:%ld\n", t0, t1); mpres_clear (x, modulus); mpmod_clear (modulus); return (t1 < t0); } #endif /* returns +/-l if n is a factor of N = 2^l +/- 1 with N <= n^threshold, 0 otherwise. 
*/
/* The sign of the result follows the sign in 2^l +/- 1. Candidates with
   |l| above threshold * lo (lo = bit length of n minus 1) or with |l| < 16
   are rejected and 0 is returned. */
int
isbase2 (const mpz_t n, const double threshold)
{
  unsigned int k, lo;
  int res = 0;
  mpz_t u, w;

  MPZ_INIT (u);
  MPZ_INIT (w);
  lo = mpz_sizeinbase (n, 2) - 1; /* 2^lo <= n < 2^(lo+1) */
  mpz_set_ui (u, 1UL);
  mpz_mul_2exp (u, u, 2UL * lo);
  mpz_mod (w, u, n); /* 2^(2lo) mod n = -/+2^(2lo-l) if m*n = 2^l+/-1 */
  if (mpz_cmp_ui (w, 1UL) == 0) /* if 2^(2lo) mod n = 1, then n divides
                                 2^(2lo)-1. If algebraic factors have been
                                 removed, n divides either 2^lo+1 or 2^lo-1.
                                 But since n has lo+1 bits, n can only divide
                                 2^lo+1. More precisely, n must be 2^lo+1. */
    {
      /* check that n equals 2^lo+1. Since n divides 2^(2lo)-1, n is odd. */
      if (mpz_scan1 (n, 1UL) != lo)
        lo = 0;
      mpz_clear (w);
      mpz_clear (u);
      /* this early exit bypasses the threshold and |l| < 16 filters below */
      return lo;
    }
  k = mpz_sizeinbase (w, 2) - 1;
  /* if w = 2^k then n divides 2^(2*lo-k)-1 */
  mpz_set_ui (u, 1UL);
  mpz_mul_2exp (u, u, k);
  if (mpz_cmp(w, u) == 0)
    res = k - 2 * lo; /* negative l: n divides 2^|l| - 1 */
  else /* if w = -2^k then n divides 2^(2*lo-k)+1 */
    {
      mpz_neg (w, w);
      mpz_mod (w, w, n);
      k = mpz_sizeinbase (w, 2) - 1;
      mpz_set_ui (u, 1UL);
      mpz_mul_2exp (u, u, k);
      if (mpz_cmp (w, u) == 0)
        res = 2 * lo - k; /* positive l: n divides 2^l + 1 */
    }
  mpz_clear (u);
  mpz_clear (w);
#if 0
  if (res != 0)
    mpmod_tune_base2 (n, 1000000, res);
#endif
  /* reject exponents that are too large relative to the size of n */
  if (abs (res) > (int) (threshold * (double) lo))
    res = 0;

  /* reject very small exponents */
  if (abs (res) < 16)
    res = 0;

  return res;
}

/* Do base-2 reduction. R must not equal S or t.
   With absbits = |modulus->bits|, S is split into S = q * 2^absbits + r and
   folded: R = q + r when modulus->bits < 0, R = r - q otherwise. Folding is
   repeated while R has more than absbits bits. t is used as scratch. */
static void
base2mod (mpres_t R, const mpres_t S, mpres_t t, mpmod_t modulus)
{
  unsigned long absbits = abs (modulus->bits);

  ASSERT (R != S && R != t);
  mpz_tdiv_q_2exp (R, S, absbits);  /* R = high part of S */
  mpz_tdiv_r_2exp (t, S, absbits);  /* t = low part of S */
  if (modulus->bits < 0)
    mpz_add (R, R, t);
  else
    mpz_sub (R, t, R);

  /* mpz_mod (R, R, modulus->orig_modulus); */
  while (mpz_sizeinbase (R, 2) > absbits)
    {
      mpz_tdiv_q_2exp (t, R, absbits);  /* t = high part of R */
      mpz_tdiv_r_2exp (R, R, absbits);  /* R = low part of R */
      if (modulus->bits < 0)
        mpz_add (R, R, t);
      else
        mpz_sub (R, R, t);
    }
}

/* Modular reduction modulo the Fermat number 2^m+1. n = m / GMP_NUMB_BITS. Result is < 2^m+1. FIXME: this does not work with nails.
Only copies the data to R if reduction is needed and returns 1 in that case. If the value in S is reduced already, nothing is done and 0 is returned. Yes, this is ugly. */ static int base2mod_2 (mpres_t R, const mpres_t S, mp_size_t n, mpz_t modulus) { mp_size_t s; s = ABSIZ(S); if (s > n) { if (s == n + 1) { mp_srcptr sp = PTR(S); mp_ptr rp; MPZ_REALLOC (R, s); rp = PTR(R); if ((rp[n] = mpn_sub_1 (rp, sp, n, sp[n]))) rp[n] = mpn_add_1 (rp, rp, n, rp[n]); MPN_NORMALIZE(rp, s); ASSERT (s <= n || (s == n && rp[n] == 1)); SIZ(R) = (SIZ(S) > 0) ? (int) s : (int) -s; } else /* should happen rarely */ mpz_mod (R, S, modulus); return 1; } return 0; } /* subquadratic REDC, at mpn level. {orig,n} is the original modulus. Requires xn = 2n or 2n-1 and ABSIZ(orig_modulus)=n. */ static void ecm_redc_n (mp_ptr rp, mp_srcptr x0p, mp_size_t xn, mp_srcptr orig, mp_srcptr invm, mp_size_t n) { mp_ptr tp, up, xp; mp_size_t nn = n + n; mp_limb_t cy, cin; TMP_DECL(marker); ASSERT((xn == 2 * n) || (xn == 2 * n - 1)); TMP_MARK(marker); up = TMP_ALLOC_LIMBS(nn + nn); if (xn < nn) { xp = TMP_ALLOC_LIMBS(nn); MPN_COPY (xp, x0p, xn); xp[nn - 1] = 0; } else xp = (mp_ptr) x0p; #ifdef HAVE___GMPN_MULLO_N /* available up from GMP 5.0.0 */ __gmpn_mullo_n (up, xp, invm, n); #else ecm_mul_lo_n (up, xp, invm, n); #endif tp = up + nn; mpn_mul_n (tp, up, orig, n); /* add {x, 2n} and {tp, 2n}. We know that {tp, n} + {xp, n} will give either 0, or a carry out. If xp[n-1] <> 0 or tp[n-1] <> 0, then there is a carry. We use a binary OR, which sets the zero flag if and only if both operands are zero. */ cin = (mp_limb_t) ((xp[n - 1] | tp[n - 1]) ? 1 : 0); #ifdef HAVE___GMPN_ADD_NC cy = __gmpn_add_nc (rp, tp + n, xp + n, n, cin); #else cy = mpn_add_n (rp, tp + n, xp + n, n); cy += mpn_add_1 (rp, rp, n, cin); #endif /* since we add at most N-1 to the upper half of {x0p,2n}, one adjustment is enough */ if (cy) cy -= mpn_sub_n (rp, rp, orig, n); ASSERT (cy == 0); TMP_FREE(marker); } /* REDC. 
x and t must not be identical, t has limb growth */
/* subquadratic REDC, at mpz level.
   With n = modulus->bits / GMP_NUMB_BITS, x must have at most 2n limbs.
   When x has exactly 2n limbs the mpn-level ecm_redc_n() is used and r
   takes the sign of x; otherwise the reduction is done with mpz calls,
   using t as scratch. On exit r has at most n limbs. */
static void
REDC (mpres_t r, const mpres_t x, mpz_t t, mpmod_t modulus)
{
  mp_size_t n = modulus->bits / GMP_NUMB_BITS;
  mp_size_t xn = ABSIZ(x);

  ASSERT (xn <= 2 * n);
  if (xn == 2 * n) /* ecm_redc_n also accepts xn=2n-1, but this seems slower
                    for now (see remark in TODO) */
    {
      mp_ptr rp;
      MPZ_REALLOC (r, n);
      rp = PTR(r);
      ecm_redc_n (rp, PTR(x), xn, PTR(modulus->orig_modulus),
                  PTR(modulus->aux_modulus), n);
      MPN_NORMALIZE(rp, n);
      /* r gets the same sign as x */
      SIZ(r) = (SIZ(x) > 0) ? (int) n : (int) -n;
      MPZ_NORMALIZED (r);
    }
  else
    {
      mpz_tdiv_r_2exp (t, x, modulus->bits);
      mpz_mul (t, t, modulus->aux_modulus);
      mpz_tdiv_r_2exp (t, t, modulus->bits);  /* t = (x % R) * 1/N (mod R) */
      mpz_mul (t, t, modulus->orig_modulus);
      mpz_add (t, t, x);
      mpz_tdiv_q_2exp (r, t, modulus->bits);  /* r = (x + m*N) / R */
      /* one subtraction of modulus->multiple if r grew past n limbs */
      if (ABSIZ (r) > n)
        mpz_sub (r, r, modulus->multiple);
    }
  ASSERT (ABSIZ(r) <= n);
}

/* Quadratic time redc for n word moduli.
   {cp, 2*nn} is the input (clobbered), {np, nn} the modulus, and the result
   is written to {rp, nn}. The implementation is chosen at compile time:
   REDC2 if HAVE___GMPN_REDC_2 is defined, else REDC1 (with invm[0]) if
   HAVE___GMPN_REDC_1 is defined, else a plain C loop. */
static inline void
redc_basecase_n (mp_ptr rp, mp_ptr cp, mp_srcptr np, const mp_size_t nn,
                 const mp_ptr invm)
{
#ifdef HAVE___GMPN_REDC_2
  REDC2(rp, cp, np, nn, invm);
#else /* HAVE___GMPN_REDC_2 is not defined */
#ifdef HAVE___GMPN_REDC_1
  REDC1(rp, cp, np, nn, invm[0]);
#else /* neither HAVE___GMPN_REDC_2 nor HAVE___GMPN_REDC_1 is defined */
  mp_limb_t cy;
  mp_size_t j;

  /* each pass zeroes the current low limb cp[0], stores the carry of the
     addmul in its place and advances cp by one limb */
  for (j = 0; j < nn; j++)
    {
      cy = mpn_addmul_1 (cp, np, nn, cp[0] * invm[0]);
      ASSERT(cp[0] == (mp_limb_t) 0);
      cp[0] = cy;
      cp++;
    }
  /* add vector of carries and shift */
  cy = mpn_add_n (rp, cp, cp - nn, nn);
  /* the result of Montgomery's REDC is less than 2^Nbits + N,
     thus at most one correction is enough */
  if (cy != 0)
    {
      mp_limb_t t;
      t = mpn_sub_n (rp, rp, np, nn); /* a borrow should always occur here */
      ASSERT (t == 1);
    }
#endif /* HAVE___GMPN_REDC_1 */
#endif /* HAVE___GMPN_REDC_2 */
}

/* r <- c/R^nn mod n, where n has nn limbs, and R=2^GMP_NUMB_BITS. n must be odd. c must have space for at least 2*nn limbs.
r must have space for at least n limbs. c and r can be the same variable. The data in c is clobbered. */
static void
ecm_redc_basecase (mpz_ptr r, mpz_ptr c, mpmod_t modulus)
{
  mp_ptr rp;
  mp_ptr cp;
  mp_srcptr np;
  mp_size_t j, nn = modulus->bits / GMP_NUMB_BITS;

  ASSERT(ABSIZ(c) <= 2 * nn);
  ASSERT(ALLOC(c) >= 2 * nn);
  ASSERT(ALLOC(r) >= nn);
  cp = PTR(c);
  rp = PTR(r);
  np = PTR(modulus->orig_modulus);
  /* zero-pad c up to 2*nn limbs so the reduction sees a full-size input */
  for (j = ABSIZ(c); j < 2 * nn; j++)
    cp[j] = 0;

  redc_basecase_n (rp, cp, np, nn, modulus->Nprim);

  MPN_NORMALIZE (rp, nn);
  /* r takes the sign of c */
  SIZ(r) = SIZ(c) < 0 ? (int) -nn : (int) nn;
}

#ifdef USE_ASM_REDC
/* Quadratic time multiplication and REDC with nn-limb modulus.
   x and y are nn-limb residues, the nn-limb result is written to z.
   This function merely calls the correct mulredc*() assembly function
   depending on nn, and processes any leftover carry.
   Only 1 <= nn <= 20 is supported; any other size calls abort(). */

static void
mulredc (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_srcptr m,
         const mp_size_t nn, const mp_limb_t invm)
{
  mp_limb_t cy;
  switch (nn)
    {
      case 1:
        /* the 1-limb routine takes its operands by value */
        cy = mulredc1(z, x[0], y[0], m[0], invm);
        break;
      case 2:
        cy = mulredc2(z, x, y, m, invm);
        break;
      case 3:
        cy = mulredc3(z, x, y, m, invm);
        break;
      case 4:
        cy = mulredc4(z, x, y, m, invm);
        break;
      case 5:
        cy = mulredc5(z, x, y, m, invm);
        break;
      case 6:
        cy = mulredc6(z, x, y, m, invm);
        break;
      case 7:
        cy = mulredc7(z, x, y, m, invm);
        break;
      case 8:
        cy = mulredc8(z, x, y, m, invm);
        break;
      case 9:
        cy = mulredc9(z, x, y, m, invm);
        break;
      case 10:
        cy = mulredc10(z, x, y, m, invm);
        break;
      case 11:
        cy = mulredc11(z, x, y, m, invm);
        break;
      case 12:
        cy = mulredc12(z, x, y, m, invm);
        break;
      case 13:
        cy = mulredc13(z, x, y, m, invm);
        break;
      case 14:
        cy = mulredc14(z, x, y, m, invm);
        break;
      case 15:
        cy = mulredc15(z, x, y, m, invm);
        break;
      case 16:
        cy = mulredc16(z, x, y, m, invm);
        break;
      case 17:
        cy = mulredc17(z, x, y, m, invm);
        break;
      case 18:
        cy = mulredc18(z, x, y, m, invm);
        break;
      case 19:
        cy = mulredc19(z, x, y, m, invm);
        break;
      case 20:
        cy = mulredc20(z, x, y, m, invm);
        break;
      default:
        abort();
    }
  /* the result of Montgomery's REDC is less than 2^Nbits + N,
     thus at most one correction is enough */
  if (cy != 0)
    {
      ATTRIBUTE_UNUSED mp_limb_t t;
      t = mpn_sub_n (z, z, m, nn); /* a borrow should always occur here */
      ASSERT (t == 1);
    }
}

/* {rp, n} <- {ap, n}^2/B^n mod {np, n} where B = 2^GMP_NUMB_BITS.
   invm is the one-limb multiplier used to zero the low limb in each
   reduction step. A 2n-limb scratch area is taken from TMP_ALLOC_LIMBS
   and released before returning. */
ATTRIBUTE_UNUSED static void
sqrredc (mp_ptr rp, mp_srcptr ap, mp_srcptr np, const mp_size_t n,
         const mp_limb_t invm)
{
  mp_ptr cp;
  mp_size_t i;
  mp_limb_t cy, q;
  TMP_DECL(marker);

  TMP_MARK(marker);
  cp = TMP_ALLOC_LIMBS(2*n);
  /* start with the diagonal terms ap[i]^2 in limbs 2i, 2i+1 */
  for (i = 0; i < n; i++)
    umul_ppmm (cp[2*i+1], cp[2*i], ap[i], ap[i]);

  /* single-limb case: one reduction step, no cross products */
  if (UNLIKELY(n == 1))
    {
      q = cp[0] * invm;
      rp[0] = mpn_addmul_1 (cp, np, 1, q);
      cy = mpn_add_n (rp, rp, cp + 1, 1);
      goto end_sqrredc;
    }

  if (cp[0] & (mp_limb_t) 1)
    /* cp[n] is either some ap[i]^2 mod B or floor(ap[i]^2/B),
       the latter is at most floor((B-1)^2/B) = B-2, and the former cannot be
       B-1 since -1 is not a square mod 2^n for n >1, thus there is no carry
       in cp[n] + ... below */
    cp[n] += mpn_add_n (cp, cp, np, n);
  /* now {cp, 2n} is even: divide by two */
  mpn_rshift (cp, cp, 2*n, 1);
  /* now cp[2n-1] is at most B/2-1 */

  for (i = 0; i < n - 1; i++)
    {
      q = cp[i] * invm;
      cp[i] = mpn_addmul_1 (cp + i, np, n, q);
      /* accumulate ap[i+1..n-1] * ap[i] */
      rp[i] = mpn_addmul_1 (cp + 2 * i + 1, ap + i + 1, n - 1 - i, ap[i]);
    }
  /* the last iteration did set cp[n-2] to zero, accumulated a[n-1] * a[n-2] */

  /* cp[2n-1] was untouched so far, so it is still at most B/2-1 */
  q = cp[n-1] * invm;
  rp[n-1] = mpn_addmul_1 (cp + n - 1, np, n, q);
  /* rp[n-1] <= floor((B^n-1)*(B-1)/B^n)<=B-2 */

  /* now add {rp, n}, {cp+n, n} and {cp, n-1} */
  /* cp[2n-1] still <= B/2-1 */
  rp[n-1] += mpn_add_n (rp, rp, cp, n-1); /* no overflow in rp[n-1] + ... */
  cy = mpn_add_n (rp, rp, cp + n, n);
  /* multiply by 2 */
  cy = (cy << 1) + mpn_lshift (rp, rp, n, 1);
 end_sqrredc:
  /* subtract the modulus until the leftover carry is consumed */
  while (cy)
    cy -= mpn_sub_n (rp, rp, np, n);
  TMP_FREE(marker);
}

#ifdef HAVE_NATIVE_MULREDC1_N
/* Multiplies y by the 1-limb value of x and does modulo reduction.
   The resulting residue may be multiplied by some constant,
   which makes this function useful only for cases where, e.g.,
   all projective coordinates are multiplied by the same constant.
   More precisely it computes:
   {z, N} = {y, N} * x / 2^GMP_NUMB_BITS mod {m, N}
   Only 1 <= N <= 20 is supported; any other size calls abort(). */
static void
mulredc_1 (mp_ptr z, const mp_limb_t x, mp_srcptr y, mp_srcptr m,
           const mp_size_t N, const mp_limb_t invm)
{
  mp_limb_t cy;

  switch (N)
    {
     case 1:
      /* the 1-limb routine takes its operands by value */
      cy = mulredc1(z, x, y[0], m[0], invm);
      break;
     case 2:
      cy = mulredc1_2(z, x, y, m, invm);
      break;
     case 3:
      cy = mulredc1_3(z, x, y, m, invm);
      break;
     case 4:
      cy = mulredc1_4(z, x, y, m, invm);
      break;
     case 5:
      cy = mulredc1_5(z, x, y, m, invm);
      break;
     case 6:
      cy = mulredc1_6(z, x, y, m, invm);
      break;
     case 7:
      cy = mulredc1_7(z, x, y, m, invm);
      break;
     case 8:
      cy = mulredc1_8(z, x, y, m, invm);
      break;
     case 9:
      cy = mulredc1_9(z, x, y, m, invm);
      break;
     case 10:
      cy = mulredc1_10(z, x, y, m, invm);
      break;
     case 11:
      cy = mulredc1_11(z, x, y, m, invm);
      break;
     case 12:
      cy = mulredc1_12(z, x, y, m, invm);
      break;
     case 13:
      cy = mulredc1_13(z, x, y, m, invm);
      break;
     case 14:
      cy = mulredc1_14(z, x, y, m, invm);
      break;
     case 15:
      cy = mulredc1_15(z, x, y, m, invm);
      break;
     case 16:
      cy = mulredc1_16(z, x, y, m, invm);
      break;
     case 17:
      cy = mulredc1_17(z, x, y, m, invm);
      break;
     case 18:
      cy = mulredc1_18(z, x, y, m, invm);
      break;
     case 19:
      cy = mulredc1_19(z, x, y, m, invm);
      break;
     case 20:
      cy = mulredc1_20(z, x, y, m, invm);
      break;
     default:
      {
        abort ();
      }
    }
  /* the result of Montgomery's REDC is less than 2^Nbits + N,
     thus one correction (at most) is enough */
  if (cy != 0)
    {
      ATTRIBUTE_UNUSED mp_limb_t t;
      t = mpn_sub_n (z, z, m, N); /* a borrow should always occur here */
      ASSERT (t == 1);
    }
}
#endif /* ifdef
HAVE_NATIVE_MULREDC1_N */
#endif

/* Per-size mode tables, indexed by the limb count nn and consulted only
   when nn <= MULREDC_ASSEMBLY_MAX.  Each entry is compared against the
   MPMOD_* mode constants in the switch statements below.  When no tuned
   table macro is supplied, every entry defaults to 0. */
#ifndef TUNE_MULREDC_TABLE
#define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
#endif
#ifndef TUNE_SQRREDC_TABLE
#define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
#endif

static int tune_mulredc_table[] = TUNE_MULREDC_TABLE;
static int tune_sqrredc_table[] = TUNE_SQRREDC_TABLE;

/* {rp, nn} <- Montgomery product of {s1p, nn} and {s2p, nn} with respect to
   the modulus {np, nn}.
   invm: the single-limb variants use invm[0] only; REDC2, __gmpn_redc_n and
         ecm_redc_n receive the whole array.
   tmp:  scratch area receiving the full product from mpn_mul_n
         (NOTE(review): so presumably at least 2*nn limbs - confirm with
         callers).
   For nn <= MULREDC_ASSEMBLY_MAX the method is chosen by
   tune_mulredc_table[nn].  A case whose #ifdef is not defined has no body
   and no break, so control deliberately falls through to the next mode.
   An unknown mode prints an error and exits. */
static void
ecm_mulredc_basecase_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p,
                        mp_srcptr np, mp_size_t nn, mp_ptr invm, mp_ptr tmp)
{
  mp_limb_t cy;
  mp_size_t j;

  if (nn <= MULREDC_ASSEMBLY_MAX)
    {
      switch (tune_mulredc_table[nn])
        {
        case MPMOD_MULREDC: /* use quadratic assembly mulredc */
#ifdef USE_ASM_REDC
          mulredc (rp, s1p, s2p, np, nn, invm[0]);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDC1: /* mpn_mul_n + __gmpn_redc_1 */
#ifdef HAVE___GMPN_REDC_1
          mpn_mul_n (tmp, s1p, s2p, nn);
          REDC1(rp, tmp, np, nn, invm[0]);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDC2: /* mpn_mul_n + __gmpn_redc_2 */
#ifdef HAVE___GMPN_REDC_2
          mpn_mul_n (tmp, s1p, s2p, nn);
          REDC2(rp, tmp, np, nn, invm);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDCN: /* mpn_mul_n + __gmpn_redc_n */
#ifdef HAVE___GMPN_REDC_N
          mpn_mul_n (tmp, s1p, s2p, nn);
          __gmpn_redc_n (rp, tmp, np, nn, invm);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDC_C: /* plain C quadratic reduction */
          mpn_mul_n (tmp, s1p, s2p, nn);
          /* one reduction step per low limb; each step's carry-out is
             stored in the limb it just processed, and tmp advances */
          for (j = 0; j < nn; j++, tmp++)
            tmp[0] = mpn_addmul_1 (tmp, np, nn, tmp[0] * invm[0]);
          /* tmp has moved up by nn limbs: add the stored carries
             {tmp - nn, nn} to the high half {tmp, nn} */
          cy = mpn_add_n (rp, tmp - nn, tmp, nn);
          if (cy != 0)
            mpn_sub_n (rp, rp, np, nn); /* a borrow should always occur here */
          break;
        default:
          {
            outputf (OUTPUT_ERROR, "Invalid mulredc mode: %d\n",
                     tune_mulredc_table[nn]);
            exit (EXIT_FAILURE);
          }
        }
    }
  else /* nn > MULREDC_ASSEMBLY_MAX */
    {
      mpn_mul_n (tmp, s1p, s2p, nn);
      ecm_redc_n (rp, tmp, 2 * nn, np, invm, nn);
    }
}

/* Squaring counterpart of ecm_mulredc_basecase_n:
   {rp, nn} <- Montgomery square of {s1p, nn} with respect to {np, nn}.
   The method is chosen by tune_sqrredc_table[nn] with the same fall-through
   scheme; the assembly mode calls mulredc() with both operands equal to s1p,
   the other modes use mpn_sqr for the product. */
static void
ecm_sqrredc_basecase_n (mp_ptr rp, mp_srcptr s1p,
                        mp_srcptr np, mp_size_t nn, mp_ptr invm, mp_ptr tmp)
{
  mp_limb_t cy;
  mp_size_t j;

  if (nn <= MULREDC_ASSEMBLY_MAX)
    {
      switch (tune_sqrredc_table[nn])
        {
        case MPMOD_MULREDC: /* use quadratic assembly mulredc */
#ifdef USE_ASM_REDC
          mulredc (rp, s1p, s1p, np, nn, invm[0]);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDC1: /* mpn_sqr + __gmpn_redc_1 */
#ifdef HAVE___GMPN_REDC_1
          mpn_sqr (tmp, s1p, nn);
          REDC1(rp, tmp, np, nn, invm[0]);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDC2: /* mpn_sqr + __gmpn_redc_2 */
#ifdef HAVE___GMPN_REDC_2
          mpn_sqr (tmp, s1p, nn);
          REDC2(rp, tmp, np, nn, invm);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDCN: /* mpn_sqr + __gmpn_redc_n */
#ifdef HAVE___GMPN_REDC_N
          mpn_sqr (tmp, s1p, nn);
          __gmpn_redc_n (rp, tmp, np, nn, invm);
          break;
#endif /* otherwise go through to the next available mode */
        case MPMOD_MUL_REDC_C: /* plain C quadratic reduction */
          mpn_sqr (tmp, s1p, nn);
          /* same limb-by-limb reduction as in ecm_mulredc_basecase_n */
          for (j = 0; j < nn; j++, tmp++)
            tmp[0] = mpn_addmul_1 (tmp, np, nn, tmp[0] * invm[0]);
          cy = mpn_add_n (rp, tmp - nn, tmp, nn);
          if (cy != 0)
            mpn_sub_n (rp, rp, np, nn); /* a borrow should always occur here */
          break;
        default:
          {
            outputf (OUTPUT_ERROR, "Invalid sqrredc mode: %d\n",
                     tune_sqrredc_table[nn]);
            exit (EXIT_FAILURE);
          }
        }
    }
  else /* nn > MULREDC_ASSEMBLY_MAX */
    {
      mpn_sqr (tmp, s1p, nn);
      ecm_redc_n (rp, tmp, 2 * nn, np, invm, nn);
    }
}

/* R <- S1 * S2 mod modulus i.e. R <- S1*S2/r^nn mod n, where n has nn limbs, and r=2^GMP_NUMB_BITS.
Same as ecm_redc_basecase previous, but combined with mul Neither input argument must be in modulus->temp1 */ static void ecm_mulredc_basecase (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { mp_ptr s1p, s2p, rp = PTR(R); mp_size_t j, nn = modulus->bits / GMP_NUMB_BITS; ASSERT(ALLOC(R) >= nn); ASSERT(ALLOC(S1) >= nn); ASSERT(ALLOC(S2) >= nn); s1p = PTR(S1); s2p = PTR(S2); /* FIXME: S1 and S2 are input and marked const, we mustn't write to them */ for (j = ABSIZ(S1); j < nn; j++) s1p[j] = 0; for (j = ABSIZ(S2); j < nn; j++) s2p[j] = 0; ecm_mulredc_basecase_n (rp, s1p, s2p, PTR(modulus->orig_modulus), nn, modulus->Nprim, PTR(modulus->temp1)); MPN_NORMALIZE (rp, nn); SIZ(R) = (SIZ(S1)*SIZ(S2)) < 0 ? (int) -nn : (int) nn; } /* R <- S1^2 mod modulus i.e. R <- S1^2/r^nn mod n, where n has nn limbs, and r=2^GMP_NUMB_BITS. Same as ecm_redc_basecase previous, but combined with sqr The input argument must not be in modulus->temp1 */ static void ecm_sqrredc_basecase (mpres_t R, const mpres_t S1, mpmod_t modulus) { mp_ptr rp; mp_ptr s1p; mp_size_t j, nn = modulus->bits / GMP_NUMB_BITS; ASSERT(ALLOC(R) >= nn); ASSERT(ALLOC(S1) >= nn); rp = PTR(R); s1p = PTR(S1); /* FIXME: S1 is input and marked const, we mustn't write to it */ for (j = ABSIZ(S1); j < nn; j++) s1p[j] = 0; ecm_sqrredc_basecase_n (rp, s1p, PTR(modulus->orig_modulus), nn, modulus->Nprim, PTR(modulus->temp1)); MPN_NORMALIZE (rp, nn); SIZ(R) = (int) nn; } /* Multiplies S1 by the one-limb integer S2, and does modulo reduction. The modulo reduction may imply multiplication of the residue class by some constant, since we may not do the correct number of REDC reduction passes and so fail to divide by the correct power of 2 for Montgomery representation. The constant is the same for each call of this function with a given modulus, however. 
*/ static void ecm_mulredc_1_basecase (mpres_t R, const mpres_t S1, const mp_limb_t S2, mpmod_t modulus) { mp_ptr s1p; mp_size_t j, nn = modulus->bits / GMP_NUMB_BITS; ASSERT(ALLOC(R) >= nn); ASSERT(ALLOC(S1) >= nn); s1p = PTR(S1); for (j = ABSIZ(S1); j < nn; j++) s1p[j] = 0; #ifdef HAVE_NATIVE_MULREDC1_N if (nn < 20) { mp_ptr rp = PTR(R); mulredc_1(rp, S2, s1p, PTR(modulus->orig_modulus), nn, modulus->Nprim[0]); MPN_NORMALIZE (rp, nn); SIZ(R) = (SIZ(S1)) < 0 ? (int) -nn : (int) nn; } else #endif { /* FIXME, we can do much better than this */ mpz_mul_ui (modulus->temp1, S1, S2); mpz_mod(R, modulus->temp1, modulus->orig_modulus); } } /* If the user asked for a particular representation, always use it. If repr = ECM_MOD_DEFAULT, use the thresholds. Don't use base2 if repr = ECM_MOD_NOBASE2. If a value is <= -16 or >= 16, it is a base2 exponent. Return a non-zero value if an error occurred. */ int mpmod_init (mpmod_t modulus, const mpz_t N, int repr) { int base2 = 0, r = 0; mp_size_t n = mpz_size (N); switch (repr) { case ECM_MOD_DEFAULT: if ((base2 = isbase2 (N, BASE2_THRESHOLD))) { repr = ECM_MOD_BASE2; break; } /* else go through */ case ECM_MOD_NOBASE2: if (mpz_size (N) < MPZMOD_THRESHOLD) repr = ECM_MOD_MODMULN; else if (mpz_size (N) < REDC_THRESHOLD) repr = ECM_MOD_MPZ; else repr = ECM_MOD_REDC; } /* now repr is {ECM_MOD_BASE2, ECM_MOD_MODMULN, ECM_MOD_MPZ, ECM_MOD_REDC}, or |repr| >= 16. */ switch (repr) { case ECM_MOD_MPZ: outputf (OUTPUT_VERBOSE, "Using mpz_mod\n"); mpmod_init_MPZ (modulus, N); break; case ECM_MOD_MODMULN: outputf (OUTPUT_VERBOSE, "Using MODMULN [mulredc:%d, sqrredc:%d]\n", (n <= MULREDC_ASSEMBLY_MAX) ? tune_mulredc_table[n] : 4, (n <= MULREDC_ASSEMBLY_MAX) ? 
tune_sqrredc_table[n] : 4); mpmod_init_MODMULN (modulus, N); break; case ECM_MOD_REDC: outputf (OUTPUT_VERBOSE, "Using REDC\n"); mpmod_init_REDC (modulus, N); break; default: /* base2 case: either repr=ECM_MOD_BASE2, and base2 was determined above, or |repr| >= 16, and we want base2 = repr */ if (repr != ECM_MOD_BASE2) base2 = repr; r = mpmod_init_BASE2 (modulus, base2, N); ASSERT (r == 0); /* error should not happen if isbase2 is correct */ break; } return r; } void mpres_clear (mpres_t a, ATTRIBUTE_UNUSED const mpmod_t modulus) { mpz_clear (a); PTR(a) = NULL; /* Make sure we segfault if we access it again */ } void mpmod_init_MPZ (mpmod_t modulus, const mpz_t N) { size_t n; mpz_init_set (modulus->orig_modulus, N); modulus->repr = ECM_MOD_MPZ; n = mpz_size (N); /* number of limbs of N */ modulus->bits = n * GMP_NUMB_BITS; /* Number of bits, rounded up to full limb */ MPZ_INIT2 (modulus->temp1, 2UL * modulus->bits + GMP_NUMB_BITS); MPZ_INIT2 (modulus->temp2, modulus->bits); MPZ_INIT2 (modulus->aux_modulus, modulus->bits); mpz_set_ui (modulus->aux_modulus, 1UL); /* we precompute B^(n + ceil(n/2)) mod N, where B=2^GMP_NUMB_BITS */ mpz_mul_2exp (modulus->aux_modulus, modulus->aux_modulus, (n + (n + 1) / 2) * GMP_NUMB_BITS); mpz_mod (modulus->aux_modulus, modulus->aux_modulus, N); return; } int mpmod_init_BASE2 (mpmod_t modulus, const int base2, const mpz_t N) { int Nbits; outputf (OUTPUT_VERBOSE, "Using special division for factor of 2^%d%c1\n", abs (base2), (base2 < 0) ? 
'-' : '+'); mpz_init_set (modulus->orig_modulus, N); modulus->repr = ECM_MOD_BASE2; modulus->bits = base2; Nbits = mpz_size (N) * GMP_NUMB_BITS; /* Number of bits, rounded up to full limb */ MPZ_INIT2 (modulus->temp1, 2UL * Nbits + GMP_NUMB_BITS); MPZ_INIT2 (modulus->temp2, Nbits); mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, abs (base2)); if (base2 < 0) mpz_sub_ui (modulus->temp1, modulus->temp1, 1UL); else mpz_add_ui (modulus->temp1, modulus->temp1, 1UL); if (!mpz_divisible_p (modulus->temp1, N)) { outputf (OUTPUT_ERROR, "mpmod_init_BASE2: n does not divide 2^%d%c1\n", abs (base2), base2 < 0 ? '-' : '+'); mpz_clear (modulus->temp2); mpz_clear (modulus->temp1); mpz_clear (modulus->orig_modulus); return ECM_ERROR; } modulus->Fermat = 0; if (base2 > 0) { unsigned long i; for (i = base2; (i & 1) == 0; i >>= 1); if (i == 1) { modulus->Fermat = base2; } } return 0; } /* initialize the following fields: orig_modulus - the original modulus bits - # of bits of N, rounded up to a multiple of GMP_NUMB_BITS temp1, temp2 - auxiliary variables Nprim - -1/N mod B^n where B=2^GMP_NUMB_BITS and n = #limbs(N) R2 - (2^bits)^2 (mod N) R3 - (2^bits)^3 (mod N) multiple - smallest multiple of N >= 2^bits */ void mpmod_init_MODMULN (mpmod_t modulus, const mpz_t N) { int Nbits; MEMORY_TAG; mpz_init_set (modulus->orig_modulus, N); MEMORY_UNTAG; modulus->repr = ECM_MOD_MODMULN; Nbits = mpz_size (N) * GMP_NUMB_BITS; /* Number of bits, rounded up to full limb */ modulus->bits = Nbits; MPZ_INIT2 (modulus->temp1, 2UL * Nbits + GMP_NUMB_BITS); MPZ_INIT2 (modulus->temp2, Nbits + 1); modulus->Nprim = (mp_limb_t*) malloc (mpz_size (N) * sizeof (mp_limb_t)); MPZ_INIT2 (modulus->R2, Nbits); mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, 2 * Nbits); mpz_mod (modulus->R2, modulus->temp1, modulus->orig_modulus); /* Now R2 = (2^bits)^2 (mod N) */ MPZ_INIT2 (modulus->R3, Nbits); mpz_mul_2exp (modulus->temp1, modulus->R2, Nbits); mpz_mod 
(modulus->R3, modulus->temp1, modulus->orig_modulus); /* Now R3 = (2^bits)^3 (mod N) */ MPZ_INIT2 (modulus->multiple, Nbits); mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, Nbits); /* compute ceil(2^bits / N) */ mpz_cdiv_q (modulus->temp1, modulus->temp1, modulus->orig_modulus); mpz_mul (modulus->multiple, modulus->temp1, modulus->orig_modulus); /* Now multiple is the smallest multiple of N >= 2^bits */ mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, Nbits); /* since we directly check even modulus in ecm/pm1/pp1, N is odd here, thus 1/N mod 2^Nbits always exist */ mpz_invert (modulus->temp2, N, modulus->temp1); /* temp2 = 1/N mod B^n */ mpz_sub (modulus->temp2, modulus->temp1, modulus->temp2); /* temp2 = -1/N mod B^n */ /* ensure Nprim has all its n limbs correctly set, for ecm_redc_n */ MPN_ZERO(modulus->Nprim, mpz_size (N)); mpn_copyi (modulus->Nprim, PTR(modulus->temp2), ABSIZ(modulus->temp2)); } void mpmod_init_REDC (mpmod_t modulus, const mpz_t N) { mp_size_t n; int Nbits; mpz_init_set (modulus->orig_modulus, N); n = mpz_size (N); modulus->repr = ECM_MOD_REDC; Nbits = n * GMP_NUMB_BITS; /* Number of bits, rounded up to full limb */ modulus->bits = Nbits; MPZ_INIT2 (modulus->temp1, 2 * Nbits + GMP_NUMB_BITS); MPZ_INIT2 (modulus->temp2, Nbits); MPZ_INIT2 (modulus->aux_modulus, Nbits); mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, Nbits); /* since we directly check even modulus in ecm/pm1/pp1, N is odd here, thus 1/N mod 2^Nbits always exist */ mpz_invert (modulus->aux_modulus, N, modulus->temp1); mpz_sub (modulus->aux_modulus, modulus->temp1, modulus->aux_modulus); /* ensure aux_modulus has n allocated limbs, for ecm_redc_n */ if (ABSIZ(modulus->aux_modulus) < n) { _mpz_realloc (modulus->aux_modulus, n); /* in case the reallocation fails, _mpz_realloc sets the value to 0 */ ASSERT_ALWAYS (mpz_cmp_ui (modulus->aux_modulus, 0) != 0); MPN_ZERO 
(PTR(modulus->aux_modulus) + ABSIZ(modulus->aux_modulus), n - ABSIZ(modulus->aux_modulus)); } MPZ_INIT2 (modulus->R2, Nbits); mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, 2 * Nbits); mpz_mod (modulus->R2, modulus->temp1, modulus->orig_modulus); /* Now R2 = (2^bits)^2 (mod N) */ MPZ_INIT2 (modulus->R3, Nbits); mpz_mul_2exp (modulus->temp1, modulus->R2, Nbits); mpz_mod (modulus->R3, modulus->temp1, modulus->orig_modulus); /* Now R3 = (2^bits)^3 (mod N) */ MPZ_INIT (modulus->multiple); mpz_set_ui (modulus->temp1, 1UL); mpz_mul_2exp (modulus->temp1, modulus->temp1, Nbits); /* compute ceil(2^bits / N) */ mpz_cdiv_q (modulus->temp1, modulus->temp1, modulus->orig_modulus); mpz_mul (modulus->multiple, modulus->temp1, modulus->orig_modulus); /* Now multiple is the largest multiple of N >= 2^bits */ } void mpmod_clear (mpmod_t modulus) { mpz_clear (modulus->orig_modulus); mpz_clear (modulus->temp1); mpz_clear (modulus->temp2); if (modulus->repr == ECM_MOD_REDC || modulus->repr == ECM_MOD_MPZ) mpz_clear (modulus->aux_modulus); if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_clear (modulus->R2); mpz_clear (modulus->R3); mpz_clear (modulus->multiple); } if (modulus->repr == ECM_MOD_MODMULN) free (modulus->Nprim); return; } /* initialize r and set all entries from those of modulus */ void mpmod_init_set (mpmod_t r, const mpmod_t modulus) { const unsigned long Nbits = abs(modulus->bits); const unsigned long n = mpz_size (modulus->orig_modulus); r->repr = modulus->repr; r->bits = modulus->bits; r->Fermat = modulus->Fermat; mpz_init_set (r->orig_modulus, modulus->orig_modulus); MPZ_INIT2 (r->temp1, 2 * Nbits + GMP_NUMB_BITS); MPZ_INIT2 (r->temp2, Nbits + GMP_NUMB_BITS); if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { MPZ_INIT2 (r->multiple, Nbits); MPZ_INIT2 (r->R2, Nbits); MPZ_INIT2 (r->R3, Nbits); mpz_set (r->multiple, modulus->multiple); mpz_set (r->R2, modulus->R2); mpz_set (r->R3, 
modulus->R3); } if (modulus->repr == ECM_MOD_REDC || modulus->repr == ECM_MOD_MPZ) { MPZ_INIT2 (r->aux_modulus, Nbits); mpz_set (r->aux_modulus, modulus->aux_modulus); } if (modulus->repr == ECM_MOD_MODMULN) { r->Nprim = (mp_limb_t*) malloc (n * sizeof (mp_limb_t)); mpn_copyi (r->Nprim, modulus->Nprim, n); } } void mpres_init (mpres_t R, const mpmod_t modulus) { /* use mpz_sizeinbase since modulus->bits may not be initialized yet */ mpz_init2 (R, mpz_sizeinbase (modulus->orig_modulus, 2) + GMP_NUMB_BITS); } /* realloc R so that it has at least the same number of limbs as modulus */ void mpres_realloc (mpres_t R, const mpmod_t modulus) { if (modulus->repr == ECM_MOD_MODMULN) MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); } /* Returns non-zero if the two residues are equal, and zero if they are not */ int mpres_equal (const mpres_t S1, const mpres_t S2, mpmod_t modulus) { mpz_mod (modulus->temp1, S1, modulus->orig_modulus); mpz_mod (modulus->temp2, S2, modulus->orig_modulus); return (mpz_cmp (modulus->temp1, modulus->temp2) == 0); } /* R <- BASE^EXP mod modulus. Assume EXP >= 0. 
*/ void mpres_pow (mpres_t R, const mpres_t BASE, const mpz_t EXP, mpmod_t modulus) { ASSERT_NORMALIZED (BASE); if (modulus->repr == ECM_MOD_MPZ) { mpz_powm (R, BASE, EXP, modulus->orig_modulus); } else if (modulus->repr == ECM_MOD_BASE2 || modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { size_t expidx; mp_limb_t bitmask, expbits; /* case EXP=0 */ if (mpz_sgn (EXP) == 0) { mpres_set_ui (R, 1UL, modulus); /* set result to 1 */ ASSERT_NORMALIZED (R); return; } ASSERT (mpz_size (EXP) > 0); /* probably redundant with _sgn() test */ expidx = mpz_size (EXP) - 1; /* point at most significant limb */ expbits = mpz_getlimbn (EXP, expidx); /* get most significant limb */ ASSERT (expbits != 0); /* Scan for the MSB in expbits */ bitmask = ((mp_limb_t) 1) << (GMP_NUMB_BITS - 1); for (; (bitmask & expbits) == 0; bitmask >>= 1); /* here the most significant limb with any set bits is in expbits, */ /* bitmask is set to mask in the msb of expbits */ mpz_set (modulus->temp2, BASE); bitmask >>= 1; while (1) { for ( ; bitmask != 0; bitmask >>= 1) { /* Set temp2 = temp2*temp2 */ if (modulus->repr == ECM_MOD_BASE2) { mpz_mul (modulus->temp1, modulus->temp2, modulus->temp2); base2mod (modulus->temp2 , modulus->temp1, modulus->temp1, modulus); } else if (modulus->repr == ECM_MOD_MODMULN) { ecm_mulredc_basecase (modulus->temp2, modulus->temp2, modulus->temp2, modulus); } else { mpz_mul (modulus->temp1, modulus->temp2, modulus->temp2); REDC (modulus->temp2, modulus->temp1, modulus->temp2, modulus); } /* If bit is 1, set temp2 = temp2 * BASE */ if (expbits & bitmask) { if (modulus->repr == ECM_MOD_BASE2) { mpz_mul (modulus->temp1, modulus->temp2, BASE); base2mod (modulus->temp2, modulus->temp1, modulus->temp1, modulus); } else if (modulus->repr == ECM_MOD_MODMULN) { ecm_mulredc_basecase (modulus->temp2, BASE, modulus->temp2, modulus); } else { mpz_mul (modulus->temp1, modulus->temp2, BASE); REDC (modulus->temp2, modulus->temp1, modulus->temp2, modulus); } } } if (expidx 
== 0) /* if we just processed the least */ break; /* significant limb, we are done */ expidx --; expbits = mpz_getlimbn (EXP, expidx); bitmask = (mp_limb_t) 1 << (GMP_NUMB_BITS - 1); } mpz_set (R, modulus->temp2); /* mpz_getlimbn() ignores sign of argument, so we computed BASE^|EXP|. If EXP was negative, do a modular inverse */ if (mpz_sgn (EXP) < 0) { mpres_invert (R, R, modulus); } } /* if (modulus->repr == ECM_MOD_BASE2 || ... ) */ ASSERT_NORMALIZED (R); } /* Returns 1 if S == 0 (mod modulus), 0 otherwise */ int mpres_is_zero (const mpres_t S, mpmod_t modulus) { mpz_mod (modulus->temp1, S, modulus->orig_modulus); /* For all currently implemented representations, a zero residue has zero integer representation */ return (mpz_sgn (modulus->temp1) == 0) ? 1 : 0; } /* R <- BASE^EXP mod modulus */ void mpres_ui_pow (mpres_t R, const unsigned long BASE, const mpres_t EXP, mpmod_t modulus) { if (modulus->repr == ECM_MOD_MPZ) { mpz_set_ui (modulus->temp1, BASE); mpz_powm (R, modulus->temp1, EXP, modulus->orig_modulus); } else if (modulus->repr == ECM_MOD_BASE2 || modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { size_t expidx; mp_limb_t bitmask, expbits; expidx = mpz_size (EXP) -1; /* point at most significant limb */ expbits = mpz_getlimbn (EXP, expidx); /* get most significant limb */ bitmask = (mp_limb_t) 1 << (GMP_NUMB_BITS - 1); /* case EXP=0 */ if (mpz_sgn (EXP) == 0) { mpres_set_ui (R, 1UL, modulus); /* set result to 1 */ ASSERT_NORMALIZED (R); return; } ASSERT (mpz_size (EXP) > 0); /* probably redundant with _sgn() test */ expidx = mpz_size (EXP) - 1; /* point at most significant limb */ expbits = mpz_getlimbn (EXP, expidx); /* get most significant limb */ ASSERT (expbits != 0); /* Scan for the MSB in expbits */ bitmask = ((mp_limb_t) 1) << (GMP_NUMB_BITS - 1); for (; (bitmask & expbits) == 0; bitmask >>= 1); /* here the most significant limb with any set bits is in expbits, */ /* bitmask is set to mask in the msb of expbits */ mpz_set_ui 
(modulus->temp2, BASE); /* temp2 = BASE */ if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_mul_2exp (modulus->temp1, modulus->temp2, modulus->bits); mpz_mod (modulus->temp2, modulus->temp1, modulus->orig_modulus); } bitmask >>= 1; while (1) { for ( ; bitmask != 0; bitmask >>= 1) { /* Set temp2 = temp2*temp2 */ if (modulus->repr == ECM_MOD_BASE2) { mpz_mul (modulus->temp1, modulus->temp2, modulus->temp2); base2mod (modulus->temp2 , modulus->temp1, modulus->temp1, modulus); } else if (modulus->repr == ECM_MOD_MODMULN) { ecm_mulredc_basecase (modulus->temp2, modulus->temp2, modulus->temp2, modulus); } else { mpz_mul (modulus->temp1, modulus->temp2, modulus->temp2); REDC (modulus->temp2, modulus->temp1, modulus->temp2, modulus); } /* If bit is 1, set temp2 = temp2 * BASE */ if (expbits & bitmask) { if (BASE == 2UL) { mpz_mul_2exp (modulus->temp2, modulus->temp2, 1); if (mpz_cmp (modulus->temp2, modulus->orig_modulus) >= 0) mpz_sub (modulus->temp2, modulus->temp2, modulus->orig_modulus); } else { mpz_mul_ui (modulus->temp1, modulus->temp2, BASE); mpz_mod (modulus->temp2, modulus->temp1, modulus->orig_modulus); } } } if (expidx == 0) /* if we just processed the least */ break; /* significant limb, we are done */ expidx--; expbits = mpz_getlimbn (EXP, expidx); bitmask = (mp_limb_t) 1 << (GMP_NUMB_BITS - 1); } mpz_set (R, modulus->temp2); /* mpz_getlimbn() ignores sign of argument, so we computed BASE^|EXP|. If EXP was negative, do a modular inverse */ if (mpz_sgn (EXP) < 0) { mpres_invert (R, R, modulus); } } /* if (modulus->repr == ECM_MOD_BASE2 || ... ) */ ASSERT_NORMALIZED (R); } /* We use here the algorithm described in "Fast Modular Reduction" from Hasenplaugh, Gaubatz and Gobal, Arith'18, 2007: assuming N has n limbs, we have precomputed C = B^(n + ceil(n/2)) mod N. 
*/ static void mpres_mpz_mod (mpres_t R, mpz_t T, mpz_t N, mpz_t C) { size_t n = mpz_size (N); size_t t = mpz_size (T); size_t m = n + (n + 1) / 2; /* n + ceil(n/2) */ if (t > m && n > 1) /* if n=1, then m=2, thus h=0 */ { size_t c = mpz_size (C); size_t h, l; mp_ptr rp; mp_ptr tp = PTR(T); /* Warning: we might have t > 2n. In that case we reduce {tp+l+m, t-(m+l)} where l = t-2n. */ l = (t > 2 * n) ? t - 2 * n : 0; tp += l; h = t - (m + l); /* since t-l <= 2n and m = n + ceil(n/2), we have h <= n - ceil(n/2) = floor(n/2). On the other hand, if l=0 we have h = t-m > 0; if l>0, then l=t-2n, thus h=2n-m = floor(n/2) > 0 since n > 1. */ mpz_realloc (R, c + h); rp = PTR(R); if (c > h) mpn_mul (rp, PTR(C), c, tp + m, h); else mpn_mul (rp, tp + m, h, PTR(C), c); /* now add {rp, c+h} to {tp, m}: we have c <= n and h <= n/2, thus c + h <= m */ if (c + h > m) abort(); tp[m] = mpn_add (tp, tp, m, rp, c + h); m += l + tp[m]; tp -= l; /* put back the low l limbs */ MPN_NORMALIZE(tp, m); SIZ(T) = (SIZ(T) > 0) ? 
m : -m; } mpz_mod (R, T, N); } void mpres_mul (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { ASSERT_NORMALIZED (S1); ASSERT_NORMALIZED (S2); #ifdef WANT_ASSERT_EXPENSIVE mpz_t test1, test2, test_result1, test_result2; ASSERT_ALWAYS (S1 != modulus->temp1 && S2 != modulus->temp1 && R != modulus->temp1); mpz_init (test1); mpz_init (test2); mpz_init (test_result1); mpz_init (test_result2); mpres_get_z (test1, S1, modulus); mpres_get_z (test2, S2, modulus); mpz_mul (test_result1, test1, test2); mpz_mod (test_result1, test_result1, modulus->orig_modulus); #endif if (UNLIKELY(modulus->repr == ECM_MOD_BASE2 && modulus->Fermat >= 32768)) { mp_size_t n = modulus->Fermat / GMP_NUMB_BITS; unsigned long k; mp_srcptr s1p, s2p; mp_size_t s1s, s2s; MPZ_REALLOC (R, n + 1); s1p = PTR(S1); s1s = SIZ(S1); s2p = PTR(S2); s2s = SIZ(S2); k = mpn_fft_best_k (n, S1 == S2); ASSERT(mpn_fft_next_size (n, k) == n); if (base2mod_2 (modulus->temp1, S1, n, modulus->orig_modulus)) { s1p = PTR(modulus->temp1); s1s = SIZ(modulus->temp1); } if (S1 == S2) { s2p = s1p; s2s = s1s; } else if (base2mod_2 (modulus->temp2, S2, n, modulus->orig_modulus)) { s2p = PTR(modulus->temp2); s2s = SIZ(modulus->temp2); } /* mpn_mul_fft() computes the product modulo B^n + 1, where B = 2^(machine word size in bits). So the result can be = B^n, in that case R is set to zero and 1 is returned as carry-out. In all other cases 0 is returned. Hence the complete result is R + cy * B^n, where cy is the value returned by mpn_mul_fft(). */ PTR(R)[n] = mpn_mul_fft (PTR(R), n, s1p, ABS(s1s), s2p, ABS(s2s), k); n ++; MPN_NORMALIZE(PTR(R), n); SIZ(R) = ((s1s ^ s2s) >= 0) ? 
(int) n : (int) -n; return; } switch (modulus->repr) { case ECM_MOD_BASE2: mpz_mul (modulus->temp1, S1, S2); base2mod (R, modulus->temp1, modulus->temp1, modulus); break; case ECM_MOD_MODMULN: MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); ecm_mulredc_basecase (R, S1, S2, modulus); break; case ECM_MOD_REDC: mpz_mul (modulus->temp1, S1, S2); REDC (R, modulus->temp1, modulus->temp2, modulus); break; default: /* case ECM_MOD_MPZ */ mpz_mul (modulus->temp1, S1, S2); mpres_mpz_mod (R, modulus->temp1, modulus->orig_modulus, modulus->aux_modulus); break; } ASSERT_NORMALIZED (R); #ifdef WANT_ASSERT_EXPENSIVE mpres_get_z (test_result2, R, modulus); if (mpz_cmp (test_result1, test_result2) != 0) { printf ("mpres_mul and mpz_mul/mpz_mod produced different results.\n"); gmp_printf ("input 1: %Zd\n", test1); gmp_printf ("input 2: %Zd\n", test2); gmp_printf ("mpres_mul: %Zd\n", test_result2); gmp_printf ("mpz_mul/mpz_mod: %Zd\n", test_result1); abort (); } mpz_clear (test1); mpz_clear (test2); mpz_clear (test_result1); mpz_clear (test_result2); #endif } /* R <- S1^2 mod modulus */ void mpres_sqr (mpres_t R, const mpres_t S1, mpmod_t modulus) { ASSERT_NORMALIZED (S1); #ifdef WANT_ASSERT_EXPENSIVE mpz_t test1, test2, test_result1, test_result2; ASSERT_ALWAYS (S1 != modulus->temp1 && R != modulus->temp1); mpz_init (test1); mpz_init (test_result1); mpz_init (test_result2); mpres_get_z (test1, S1, modulus); mpz_mul (test_result1, test1, test1); mpz_mod (test_result1, test_result1, modulus->orig_modulus); #endif if (UNLIKELY(modulus->repr == ECM_MOD_BASE2 && modulus->Fermat >= 32768)) { mpres_mul (R, S1, S1, modulus); return; } switch (modulus->repr) { case ECM_MOD_BASE2: mpz_mul (modulus->temp1, S1, S1); base2mod (R, modulus->temp1, modulus->temp1, modulus); break; case ECM_MOD_MODMULN: MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); ecm_sqrredc_basecase (R, S1, modulus); break; case ECM_MOD_REDC: mpz_mul (modulus->temp1, S1, S1); REDC (R, modulus->temp1, modulus->temp2, modulus); 
break; default: /* case ECM_MOD_MPZ */ mpz_mul (modulus->temp1, S1, S1); mpres_mpz_mod (R, modulus->temp1, modulus->orig_modulus, modulus->aux_modulus); break; } ASSERT_NORMALIZED (R); #ifdef WANT_ASSERT_EXPENSIVE mpres_get_z (test_result2, R, modulus); if (mpz_cmp (test_result1, test_result2) != 0) { printf ("mpres_sqr and mpz_mul/mpz_mod produced different results.\n"); gmp_printf ("input 1: %Zd\n", test1); gmp_printf ("mpres_mul: %Zd\n", test_result2); gmp_printf ("mpz_mul/mpz_mod: %Zd\n", test_result1); abort (); } mpz_clear (test1); mpz_clear (test_result1); mpz_clear (test_result2); #endif } /* R <- S * n mod modulus */ void mpres_mul_ui (mpres_t R, const mpres_t S, const unsigned long n, mpmod_t modulus) { ASSERT_NORMALIZED (S); mpz_mul_ui (modulus->temp1, S, n); /* This is the same for all methods: just reduce with original modulus */ mpz_mod (R, modulus->temp1, modulus->orig_modulus); ASSERT_NORMALIZED (R); } /* R <- S * 2^k mod modulus */ void mpres_mul_2exp (mpres_t R, const mpres_t S, const unsigned long k, mpmod_t modulus) { ASSERT_NORMALIZED (S); mpz_mul_2exp (modulus->temp1, S, k); /* This is the same for all methods: just reduce with original modulus */ mpz_mod (R, modulus->temp1, modulus->orig_modulus); ASSERT_NORMALIZED (R); } /* Multiplies S by n and possibly divides by some constant. Whether or not it divides depends on the modulus representation and the modulus size. */ void mpres_muldivbysomething_si (mpres_t R, const mpres_t S, const long n, mpmod_t modulus) { ASSERT_NORMALIZED (S); if (modulus->repr == ECM_MOD_MODMULN && modulus->bits / GMP_NUMB_BITS <= 20) /* FIXME: is the 20 here the same constant as in mulredc1_20? If so, it should be changed into a macro. 
*/ { MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); if (n < 0) { ecm_mulredc_1_basecase (R, S, (mp_limb_t) -n, modulus); mpres_neg (R, R, modulus); } else { ecm_mulredc_1_basecase (R, S, (mp_limb_t) n, modulus); } } else { mpz_mul_si (modulus->temp1, S, n); /* This is the same for all methods: just reduce with original modulus */ mpz_mod (R, modulus->temp1, modulus->orig_modulus); } ASSERT_NORMALIZED (R); } /* This function multiplies an integer in mpres_t form with an integer in mpz_t form, and stores the output in mpz_t form. The advantage is that one REDC suffices to reduce the product and convert it to non-Montgomery representation. */ void mpres_mul_z_to_z (mpz_t R, const mpres_t S1, const mpz_t S2, mpmod_t modulus) { ASSERT_NORMALIZED (S1); if (modulus->repr == ECM_MOD_BASE2 && modulus->Fermat >= 32768) { mp_size_t n = modulus->Fermat / GMP_NUMB_BITS; unsigned long k; mp_srcptr s1p = PTR(S1), s2p = PTR(S2); mp_size_t s1s = SIZ(S1), s2s = SIZ(S2); MPZ_REALLOC (R, n + 1); k = mpn_fft_best_k (n, S1 == S2); ASSERT(mpn_fft_next_size (n, k) == n); if (base2mod_2 (modulus->temp1, S1, n, modulus->orig_modulus)) { s1p = PTR(modulus->temp1); s1s = SIZ(modulus->temp1); } if (S1 == S2) { s2p = s1p; s2s = s1s; } else if (base2mod_2 (modulus->temp2, S2, n, modulus->orig_modulus)) { s2p = PTR(modulus->temp2); s2s = SIZ(modulus->temp2); } /* mpn_mul_fft() computes the product modulo B^n + 1, where B = 2^(machine word size in bits). So the result can be = B^n, in that case R is set to zero and 1 is returned as carry-out. In all other cases 0 is returned. Hence the complete result is R + cy * B^n, where cy is the value returned by mpn_mul_fft(). */ PTR(R)[n] = mpn_mul_fft (PTR(R), n, s1p, ABS(s1s), s2p, ABS(s2s), k); n ++; MPN_NORMALIZE(PTR(R), n); SIZ(R) = ((s1s ^ s2s) >= 0) ? 
(int) n : (int) -n; mpz_mod (R, R, modulus->orig_modulus); return; } switch (modulus->repr) { case ECM_MOD_BASE2: if (mpz_sizeinbase (S2, 2) > (unsigned) abs (modulus->bits)) { base2mod (modulus->temp2, S2, modulus->temp1, modulus); mpz_mul (modulus->temp1, S1, modulus->temp2); } else mpz_mul (modulus->temp1, S1, S2); base2mod (R, modulus->temp1, modulus->temp1, modulus); mpz_mod (R, R, modulus->orig_modulus); break; case ECM_MOD_MODMULN: if (mpz_cmp (S2, modulus->orig_modulus) >= 0) { mpz_mod (modulus->temp2, S2, modulus->orig_modulus); MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); ecm_mulredc_basecase (R, S1, modulus->temp2, modulus); mpz_mod (R, R, modulus->orig_modulus); } else { MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); ecm_mulredc_basecase (R, S1, S2, modulus); mpz_mod (R, R, modulus->orig_modulus); } break; case ECM_MOD_REDC: if (mpz_cmp (S2, modulus->orig_modulus) >= 0) { mpz_mod (modulus->temp2, S2, modulus->orig_modulus); mpz_mul (modulus->temp1, S1, modulus->temp2); } else mpz_mul (modulus->temp1, S1, S2); REDC (R, modulus->temp1, modulus->temp2, modulus); mpz_mod (R, R, modulus->orig_modulus); break; default: if (mpz_cmp (S2, modulus->orig_modulus) >= 0) { mpz_mod (modulus->temp2, S2, modulus->orig_modulus); mpz_mul (modulus->temp1, S1, modulus->temp2); } else mpz_mul (modulus->temp1, S1, S2); mpz_mod (R, modulus->temp1, modulus->orig_modulus); break; } ASSERT_NORMALIZED (R); } /* Sets R = S * c, for some constant c that is coprime to modulus. This is primarily useful for multiplying numbers together for a gcd with modulus. The advantage is that we don't need to convert the mpz_t to Montgomery representation before applying REDC. */ void mpres_set_z_for_gcd (mpres_t R, const mpz_t S, mpmod_t modulus) { mpz_mod (R, S, modulus->orig_modulus); ASSERT_NORMALIZED (R); } /* R <- S / 2^n mod modulus. Does not need to be fast. 
*/ void mpres_div_2exp (mpres_t R, const mpres_t S, const unsigned int n, mpmod_t modulus) { int i; ASSERT_NORMALIZED (S); if (n == 0) { mpres_set (R, S, modulus); ASSERT_NORMALIZED (R); return; } if (mpz_odd_p (S)) { ASSERT (mpz_odd_p (modulus->orig_modulus)); mpz_add (R, S, modulus->orig_modulus); mpz_tdiv_q_2exp (R, R, 1); } else mpz_tdiv_q_2exp (R, S, 1); for (i = n ; i > 1; i--) { if (mpz_odd_p (R)) { ASSERT (mpz_odd_p (modulus->orig_modulus)); mpz_add (R, R, modulus->orig_modulus); } mpz_tdiv_q_2exp (R, R, 1); } ASSERT_NORMALIZED (R); } void mpres_add_ui (mpres_t R, const mpres_t S, const unsigned long n, mpmod_t modulus) { ASSERT_NORMALIZED (S); if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_add_ui (R, S, n); if (mpz_cmp (R, modulus->orig_modulus) > 0) mpz_sub (R, R, modulus->orig_modulus); /* This assumes modulus >= n */ } else if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_set_ui (modulus->temp1, n); mpz_mul_2exp (modulus->temp1, modulus->temp1, modulus->bits); mpz_add (modulus->temp1, modulus->temp1, S); mpz_mod (R, modulus->temp1, modulus->orig_modulus); } ASSERT_NORMALIZED (R); } /* R <- S1 + S2 mod modulus */ void mpres_add (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { ASSERT_NORMALIZED (S1); ASSERT_NORMALIZED (S2); mpz_add (R, S1, S2); if ((modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) && ABSIZ(R) > ABSIZ(modulus->orig_modulus)) { if (SIZ(R) > 0) mpz_sub (R, R, modulus->multiple); else mpz_add (R, R, modulus->multiple); /* N <= since multiple < 2^Nbits + N, now |R| < B */ } ASSERT_NORMALIZED (R); } /* R <- S - n mod modulus If repr == ECM_MOD_MODMULN or ECM_MOD_REDC, we need to convert n to Montgomery representation before substracting */ void mpres_sub_ui (mpres_t R, const mpres_t S, const unsigned long n, mpmod_t modulus) { ASSERT_NORMALIZED (S); if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_sub_ui (R, S, n); if (mpz_sgn (R) 
< 0) mpz_add (R, R, modulus->orig_modulus); /* Assumes modulus >= n */ } else if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_set_ui (modulus->temp1, n); mpz_mul_2exp (modulus->temp1, modulus->temp1, modulus->bits); mpz_sub (modulus->temp1, S, modulus->temp1); mpz_mod (R, modulus->temp1, modulus->orig_modulus); } ASSERT_NORMALIZED (R); } /* R <- n - S mod modulus If repr == ECM_MOD_MODMULN or ECM_MOD_REDC, we need to convert n to Montgomery representation before substracting */ void mpres_ui_sub (mpres_t R, const unsigned long n ,const mpres_t S, mpmod_t modulus) { ASSERT_NORMALIZED (S); if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_ui_sub (R, n, S); if (mpz_sgn (R) < 0) mpz_add (R, R, modulus->orig_modulus); /* Assumes modulus >= n */ } else if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_set_ui (modulus->temp1, n); mpz_mul_2exp (modulus->temp1, modulus->temp1, modulus->bits); mpz_sub (modulus->temp1, modulus->temp1, S); mpz_mod (R, modulus->temp1, modulus->orig_modulus); } ASSERT_NORMALIZED (R); } /* R <- S1 - S2 mod modulus */ void mpres_sub (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { ASSERT_NORMALIZED (S1); ASSERT_NORMALIZED (S2); mpz_sub (R, S1, S2); if ((modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) && ABSIZ(R) > ABSIZ(modulus->orig_modulus)) { if (SIZ(R) > 0) mpz_sub (R, R, modulus->multiple); else mpz_add (R, R, modulus->multiple); /* N <= since multiple < 2^Nbits + N, now |R| < B */ } ASSERT_NORMALIZED (R); } void mpres_set_z (mpres_t R, const mpz_t S, mpmod_t modulus) { if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) mpz_mod (R, S, modulus->orig_modulus); else if (modulus->repr == ECM_MOD_MODMULN) { mpz_mod (modulus->temp2, S, modulus->orig_modulus); ecm_mulredc_basecase (R, modulus->temp2, modulus->R2, modulus); } else if (modulus->repr == ECM_MOD_REDC) { mpz_mod (modulus->temp2, S, modulus->orig_modulus); 
mpz_mul (modulus->temp1, modulus->temp2, modulus->R2); REDC (R, modulus->temp1, modulus->temp2, modulus); } ASSERT_NORMALIZED (R); } /* R and S must not be modulus->temp1 */ void mpres_get_z (mpz_t R, const mpres_t S, mpmod_t modulus) { ASSERT_NORMALIZED (S); if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_mod (R, S, modulus->orig_modulus); } else if (modulus->repr == ECM_MOD_MODMULN) { mpz_set (modulus->temp1, S); MPZ_REALLOC (R, modulus->bits / GMP_NUMB_BITS); ecm_redc_basecase (R, modulus->temp1, modulus); mpz_mod (R, R, modulus->orig_modulus); /* FIXME: can we avoid this? */ } else if (modulus->repr == ECM_MOD_REDC) { REDC (R, S, modulus->temp1, modulus); mpz_mod (R, R, modulus->orig_modulus); /* FIXME: can we avoid this? */ } #ifdef DEBUG else { fprintf (ECM_STDERR, "mpres_get_z: Unexpected representation %d\n", modulus->repr); exit (EXIT_FAILURE); } #endif } /* R <- n mod modulus If repr==ECM_MOD_MPZ or ECM_MOD_BASE2, we convert n to Montgomery representation */ void mpres_set_ui (mpres_t R, const unsigned long n, mpmod_t modulus) { if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_set_ui (R, n); mpz_mod (R, R, modulus->orig_modulus); } else if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_set_ui (modulus->temp1, n); mpz_mul_2exp (modulus->temp1, modulus->temp1, modulus->bits); mpz_mod (R, modulus->temp1, modulus->orig_modulus); } ASSERT_NORMALIZED (R); } /* same as previous but with signed long */ void mpres_set_si (mpres_t R, const long n, mpmod_t modulus) { if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_set_si (R, n); mpz_mod (R, R, modulus->orig_modulus); } else if (modulus->repr == ECM_MOD_MODMULN || modulus->repr == ECM_MOD_REDC) { mpz_set_si (modulus->temp1, n); mpz_mul_2exp (modulus->temp1, modulus->temp1, modulus->bits); mpz_mod (R, modulus->temp1, modulus->orig_modulus); } ASSERT_NORMALIZED (R); } /* R <- -S mod modulus. 
Does not need to be efficient. */ void mpres_neg (mpres_t R, const mpres_t S, ATTRIBUTE_UNUSED mpmod_t modulus) { ASSERT_NORMALIZED (S); mpz_neg (R, S); ASSERT_NORMALIZED (R); } /* Returns non-zero if inversion succeeded, and zero if not */ int mpres_invert (mpres_t R, const mpres_t S, mpmod_t modulus) { #ifdef WANT_ASSERT_EXPENSIVE mpres_t test; mpz_t test_result; mpres_init (test, modulus); mpres_set (test, S, modulus); #endif ASSERT_NORMALIZED (S); if (mpz_invert (modulus->temp2, S, modulus->orig_modulus) == 0) return 0; if (modulus->repr == ECM_MOD_MPZ || modulus->repr == ECM_MOD_BASE2) { mpz_set (R, modulus->temp2); ASSERT_NORMALIZED (R); } else if (modulus->repr == ECM_MOD_MODMULN) { ecm_mulredc_basecase (R, modulus->temp2, modulus->R3, modulus); ASSERT_NORMALIZED (R); } else if (modulus->repr == ECM_MOD_REDC) { MPZ_NORMALIZED (S); mpz_mul (modulus->temp1, modulus->temp2, modulus->R3); REDC (R, modulus->temp1, modulus->temp2, modulus); ASSERT_NORMALIZED (R); } #ifdef DEBUG else { fprintf (ECM_STDERR, "mpres_invert: Unexpected representation %d\n", modulus->repr); exit (EXIT_FAILURE); } #endif #ifdef WANT_ASSERT_EXPENSIVE mpres_mul (test, test, R, modulus); mpz_init (test_result); mpres_get_z (test_result, test, modulus); ASSERT_ALWAYS(mpz_cmp_ui (test_result, 1UL) == 0); mpz_clear (test_result); mpres_clear (test, modulus); #endif return 1; } void mpres_gcd (mpz_t R, const mpres_t S, const mpmod_t modulus) { /* In MODMULN and REDC form, M(x) = x*R with gcd(R, modulus) = 1 . Therefore gcd(M(x), modulus) = gcd(x, modulus) and we need not bother to convert out of Montgomery form. 
*/ ASSERT_NORMALIZED (S); mpz_gcd (R, S, modulus->orig_modulus); } void mpres_out_str (FILE *fd, const unsigned int base, const mpres_t S, mpmod_t modulus) { mpres_get_z (modulus->temp2, S, modulus); mpz_out_str (fd, base, modulus->temp2); } int mpmod_selftest (const mpz_t n) { mpres_t test1, test2; mpmod_t modulus; printf ("Performing self test\n"); mpmod_init (modulus, n, 0); mpres_init (test1, modulus); mpres_init (test2, modulus); mpres_set_ui (test1, 2, modulus); mpres_set_ui (test2, 5, modulus); mpres_muldivbysomething_si (test1, test1, 5, modulus); mpres_muldivbysomething_si (test2, test2, 2, modulus); if (!mpres_equal (test1, test2, modulus)) { printf ("mpres_muldivbysomething_si() wrong\n"); fflush (stdout); abort(); } mpres_clear (test1, modulus); mpres_clear (test2, modulus); mpmod_clear (modulus); return 0; } /****************************************************/ /* mpresn: modular arithmetic based directly on mpn */ /****************************************************/ /* We use here a signed word-based redundant representation. In case N < B^n/16 (since for redc where we add to the absolute value of the residue), where n is the number of limbs of N in base B (2^32 or 2^64 usually), we can prove there is no adjustment (adding or subtracting N), cf http://www.loria.fr/~zimmerma/papers/norm.pdf. However current branch predictors are quite good, thus we prefer to keep the tests and to allow any input N (instead of only N < B^n/16). */ /* ensure R has allocated space for at least n limbs, and if less than n limbs are used, pad with zeros, and set SIZ(R) to n if positive or -n if negative */ void mpresn_pad (mpres_t R, mpmod_t N) { mp_size_t n = ABSIZ(N->orig_modulus); mp_size_t rn; _mpz_realloc (R, n); rn = mpz_size (R); ASSERT_ALWAYS (rn <= n); if (rn < n) { MPN_ZERO (PTR(R) + rn, n - rn); SIZ(R) = SIZ(R) >= 0 ? n : -n; } } void mpresn_unpad (mpres_t R) { mp_size_t n = ABSIZ(R); while (n > 0 && PTR(R)[n-1] == 0) n--; SIZ(R) = SIZ(R) >= 0 ? 
n : -n; } /* R <- S1 * S1 mod N, used only for ECM_MOD_MODMULN */ void mpresn_sqr (mpres_t R, const mpres_t S1, mpmod_t modulus) { mp_size_t n = ABSIZ(modulus->orig_modulus); ASSERT (SIZ(S1) == n || -SIZ(S1) == n); ecm_sqrredc_basecase_n (PTR(R), PTR(S1), PTR(modulus->orig_modulus), n, modulus->Nprim, PTR(modulus->temp1)); SIZ(R) = n; } /* R <- S1 * S2 mod N, used only for ECM_MOD_MODMULN */ void mpresn_mul (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { mp_size_t n = ABSIZ(modulus->orig_modulus); ASSERT (SIZ(S1) == n || -SIZ(S1) == n); ASSERT (SIZ(S2) == n || -SIZ(S2) == n); ecm_mulredc_basecase_n (PTR(R), PTR(S1), PTR(S2), PTR(modulus->orig_modulus), n, modulus->Nprim, PTR(modulus->temp1)); SIZ(R) = SIZ(S1) == SIZ(S2) ? n : -n; } /* R <- S*m/B mod modulus where m fits in a mp_limb_t. Here S (w in dup_add_batch1) is the result of a subtraction, thus with the notations from http://www.loria.fr/~zimmerma/papers/norm.pdf we have S < 2 \alpha N. Then R < (2 \alpha N \beta + \beta N) = (2 \alpha + 1) N. This result R is used in an addition with u being the result of a squaring thus u < \alpha N, which gives a result < (3 \alpha + 1) N. Finally this result is used in a multiplication with another operand less than 2 \alpha N, thus we want: ((2 \alpha) (3 \alpha + 1) N^2 + \beta N)/\beta \leq \alpha N, i.e., 2 \alpha (3 \alpha + 1) \varepsilon + 1 \leq \alpha This implies \varepsilon \leq 7/2 - sqrt(3)/2 ~ 0.0359, in which case we can take \alpha = 2/3*sqrt(3)+1 ~ 2.1547. In that case no adjustment is needed in mpresn_mul_1. However we prefer to keep the adjustment here, to allow a larger set of inputs (\varepsilon \leq 1/16 = 0.0625 instead of 0.0359). 
*/ void mpresn_mul_1 (mpres_t R, const mpres_t S, const mp_limb_t m, mpmod_t modulus) { mp_ptr t1 = PTR(modulus->temp1); mp_ptr t2 = PTR(modulus->temp2); mp_size_t n = ABSIZ(modulus->orig_modulus); mp_limb_t q; ASSERT (SIZ(S) == n || -SIZ(S) == n); ASSERT (ALLOC(modulus->temp1) >= n+1); #if defined(USE_ASM_REDC) && defined(HAVE_NATIVE_MULREDC1_N) if (n <= MULREDC_ASSEMBLY_MAX) mulredc_1 (PTR(R), m, PTR(S), PTR(modulus->orig_modulus), n, modulus->Nprim[0]); else #endif { t1[n] = mpn_mul_1 (t1, PTR(S), n, m); q = t1[0] * modulus->Nprim[0]; t2[n] = mpn_mul_1 (t2, PTR(modulus->orig_modulus), n, q); #ifdef HAVE___GMPN_ADD_NC q = __gmpn_add_nc (PTR(R), t1 + 1, t2 + 1, n, t1[0] != 0); #else q = mpn_add_n (PTR(R), t1 + 1, t2 + 1, n); q += mpn_add_1 (PTR(R), PTR(R), n, t1[0] != 0); #endif while (q != 0) q -= mpn_sub_n (PTR(R), PTR(R), PTR(modulus->orig_modulus), n); } SIZ(R) = SIZ(S); /* sign is unchanged */ } /* R <- S1 + S2 mod modulus */ /* we assume all numbers are allocated to n limbs, and unused most significant limbs are set to zero */ void mpresn_add (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { mp_ptr r = PTR(R); mp_ptr s1 = PTR(S1); mp_ptr s2 = PTR(S2); mp_size_t n = ABSIZ(modulus->orig_modulus); ATTRIBUTE_UNUSED mp_limb_t cy; ASSERT (SIZ(S1) == n || -SIZ(S1) == n); ASSERT (SIZ(S2) == n || -SIZ(S2) == n); if (SIZ(S1) == SIZ(S2)) /* S1 and S2 are of same sign */ { cy = mpn_add_n (r, s1, s2, n); /* for N < B^n/16, the while loop will be never performed, which proves it will be performed a small number of times. In practice we observed up to 7 loops, but it happens rarely. 
*/ #ifndef MPRESN_NO_ADJUSTMENT while (cy != 0) cy -= mpn_sub_n (r, r, PTR(modulus->orig_modulus), n); #endif SIZ(R) = SIZ(S1); } else /* different signs */ { if (mpn_cmp (s1, s2, n) >= 0) { mpn_sub_n (r, s1, s2, n); /* no borrow here */ SIZ(R) = SIZ(S1); } else { mpn_sub_n (r, s2, s1, n); /* idem */ SIZ(R) = SIZ(S2); } } } void mpresn_sub (mpres_t R, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { mp_ptr r = PTR(R); mp_ptr s1 = PTR(S1); mp_ptr s2 = PTR(S2); mp_size_t n = ABSIZ(modulus->orig_modulus); ATTRIBUTE_UNUSED mp_limb_t cy; ASSERT (SIZ(S1) == n || -SIZ(S1) == n); ASSERT (SIZ(S2) == n || -SIZ(S2) == n); if (SIZ(S1) != SIZ(S2)) /* S1 and S2 are of different signs */ { cy = mpn_add_n (r, s1, s2, n); #ifndef MPRESN_NO_ADJUSTMENT while (cy != 0) cy -= mpn_sub_n (r, r, PTR(modulus->orig_modulus), n); #endif SIZ(R) = SIZ(S1); } else /* same signs, it's a real subtraction */ { if (mpn_cmp (s1, s2, n) >= 0) { mpn_sub_n (r, s1, s2, n); /* no borrow here */ SIZ(R) = SIZ(S1); } else { mpn_sub_n (r, s2, s1, n); /* idem */ SIZ(R) = -SIZ(S2); } } } /* (R, T) <- (S1 + S2, S1 - S2) Assume R differs from both S1 and S2. 
*/ void mpresn_addsub (mpres_t R, mpres_t T, const mpres_t S1, const mpres_t S2, mpmod_t modulus) { mp_ptr r = PTR(R); mp_ptr t = PTR(T); mp_ptr s1 = PTR(S1); mp_ptr s2 = PTR(S2); mp_size_t n = ABSIZ(modulus->orig_modulus); ATTRIBUTE_UNUSED mp_limb_t cy; ASSERT (R != S1); ASSERT (R != S2); ASSERT (SIZ(S1) == n || -SIZ(S1) == n); ASSERT (SIZ(S2) == n || -SIZ(S2) == n); if (SIZ(S1) == SIZ(S2)) /* S1 and S2 are of same sign */ { cy = mpn_add_n (r, s1, s2, n); #ifndef MPRESN_NO_ADJUSTMENT while (cy != 0) cy -= mpn_sub_n (r, r, PTR(modulus->orig_modulus), n); #endif SIZ(R) = SIZ(S1); if (mpn_cmp (s1, s2, n) >= 0) { mpn_sub_n (t, s1, s2, n); /* no borrow since {s1,n} >= {s2,n} */ SIZ(T) = SIZ(S1); } else { mpn_sub_n (t, s2, s1, n); /* idem since {s2,n} >= {s1,n} */ SIZ(T) = -SIZ(S2); } } else /* different signs */ { if (mpn_cmp (s1, s2, n) >= 0) { mpn_sub_n (r, s1, s2, n); /* no borrow since {s1,n} >= {s2,n} */ SIZ(R) = SIZ(S1); } else { mpn_sub_n (r, s2, s1, n); /* idem since {s2,n} >= {s1,n} */ SIZ(R) = SIZ(S2); } cy = mpn_add_n (t, s1, s2, n); #ifndef MPRESN_NO_ADJUSTMENT while (cy != 0) cy -= mpn_sub_n (t, t, PTR(modulus->orig_modulus), n); #endif SIZ(T) = SIZ(S1); } } ecm-6.4.4/mpmod.h0000644023561000001540000000234612106741273010510 00000000000000/* Header for modular multiplication. Copyright 2012 Paul Zimmermann. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. 
If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #define MPMOD_MULREDC 0 /* assembly combined mulredc */ #define MPMOD_MUL_REDC1 1 /* mpn_mul_n or mpn_sqr followed by mpn_redc_1 */ #define MPMOD_MUL_REDC2 2 /* mpn_mul_n or mpn_sqr followed by mpn_redc_2 */ #define MPMOD_MUL_REDCN 3 /* mpn_mul_n or mpn_sqr followed by mpn_redc_n */ #define MPMOD_MUL_REDC_C 4 /* mpn_mul_n or mpn_sqr followed by plain C redc */ ecm-6.4.4/ecm-params.h0000644023561000001540000000316612106741273011422 00000000000000/* produced on pasta.loria.fr (Intel(R) Core(TM)2 CPU 6700 @ 2.66GHz) */ #ifndef HAVE_MPIR /* tuning parameters for GMP, tuned for GMP 5.0.4 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,2,0,2,0,2,1,1,1,1,2,2,1,2,2} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 9, 10, 12, 11, 12, 13, 12, 12, 14, 16, 16, 16, 18, 18, 18} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 8 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 128 #else /* tuning parameters for MPIR, tuned for MPIR 2.5.1 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,1,1,2,2,1,1,1,1,1,1,2,1,2} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 0, 6, 6, 7, 8, 9, 9, 11, 10, 10, 11, 12, 13, 14, 14, 11, 13, 18, 18, 14, 20, 16, 18, 18, 20} #define 
NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 32 #endif ecm-6.4.4/ecm-params.h.athlon640000644023561000001540000000315012106741274013052 00000000000000/* tuned on frite.loria.fr (AMD Phenom(tm) II X2 B55 Processor) */ #ifndef HAVE_MPIR /* tuning parameters for GMP, tuned with GMP 5.0.4 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,1,2,1,1,1,2} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,2,2,1,2,2,1,2,1,2,1,1,1,2} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 9, 10, 11, 10, 12, 12, 12, 14, 14, 16, 16, 16, 18, 19, 19, 20, 21, 18, 19} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 12 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 128 #define MPZSPV_NORMALISE_STRIDE 128 #else /* tuning parameters for MPIR, tuned with MPIR 2.5.1 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 20, 24} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 12 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 512 #define PREREVERTDIVISION_NTT_THRESHOLD 32 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 128 #endif 
ecm-6.4.4/COPYING0000644023561000001540000010451312106741273010255 00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. 
Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. 
A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. 
All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>. ecm-6.4.4/main.c0000644023561000001540000015045512111676237010323 00000000000000/* GMP-ECM -- Integer factorization with ECM, P-1 and P+1 methods. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Jim Fougeron, Laurent Fousse, Alexander Kruppa, Paul Zimmermann, Cyril Bouvier. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* NOTE(review): the bare #include directives below carry no header name in
   this copy of the file; the names were presumably lost when the text was
   extracted. Restore them from a pristine main.c before compiling. */
#include
#include
#include
#include
#include
#ifdef _MSC_VER
# include
#endif
#include "ecm-impl.h"
#include "ecm-ecm.h"

#ifdef HAVE_UNISTD_H /* for access() */
# include
#else
# define F_OK 0
# ifdef HAVE_IO_H
#  include
# endif
#endif

#ifdef HAVE_SIGNAL_H
# include
#endif

#ifdef HAVE_GWNUM
/* For GWNUM_VERSION */
#include "gwnum.h"
#endif

/* Used in print_config() */
#include "ecm-params.h"

/* #define DEBUG */

#include "champions.h"

/* probab_prime_p() can get called from other modules. Instead of passing
   prpcmd to those functions, we make it static here - this variable will
   be set only in main, and read only in probab_prime_p() */
#ifdef WANT_SHELLCMD
static char *prpcmd = NULL;
#endif

/* Stop request flag: set to 1 by signal_handler() when SIGINT or SIGTERM is
   received, and reported to callers through stop_asap_test().
   NOTE(review): both flags below are plain int objects written from a signal
   handler; the C standard only guarantees this for volatile sig_atomic_t.
   Consider changing the type once the rest of the file has been checked. */
static int exit_asap_value = 0;
static int exit_asap_signalnr = 0; /* Remembers which signal we received */

/* Signal handler for SIGINT and SIGTERM.

   sig: number of the signal being delivered.

   For SIGINT or SIGTERM it sets exit_asap_value to 1, stores the signal
   number in exit_asap_signalnr, and resets the disposition of both signals
   to SIG_DFL. Any other signal number is silently ignored.
   NOTE(review): the code that installs this handler is not visible in this
   part of the file (presumably main()) -- confirm which signals are routed
   here. */
void
signal_handler (int sig)
{
  if (sig == SIGINT || sig == SIGTERM)
    {
      exit_asap_value = 1;
      exit_asap_signalnr = sig;
      /* If one of these two signals arrives again, we'll let the default
         handler take over, which will usually terminate the process
         immediately. */
      signal (SIGINT, SIG_DFL);
      signal (SIGTERM, SIG_DFL);
    }
  else
    {
      /* How did this happen? Let's ignore it for now, abort instead? */
    }
}

/* Returns the current value of exit_asap_value: 0 initially, 1 once
   signal_handler() has seen SIGINT or SIGTERM (no other writer is visible
   in this part of the file). The name suggests callers poll it to decide
   whether to stop early; the callers are not visible here. */
int
stop_asap_test ()
{
  return exit_asap_value;
}

/* Prints the command-line help to stdout: the synopsis line, the positional
   parameters (B1, B2) and the list of options. The -faccmd, -prpcmd and
   -idlecmd entries are printed only when WANT_SHELLCMD is defined at compile
   time. Takes no arguments and returns nothing; it does not exit. */
static void
usage (void)
{
  printf ("Usage: ecm [options] B1 [[B2min-]B2] < file\n");
  printf ("\nParameters:\n");
  printf (" B1 stage 1 bound\n");
  printf (" B2 stage 2 bound (or interval B2min-B2max)\n");
  printf ("\nOptions:\n");
  printf (" -x0 x use x as initial point\n");
  printf (" -sigma s use s as curve generator [ecm]\n");
  printf (" -A a use a as curve parameter [ecm]\n");
  printf (" -k n perform >= n steps in stage 2\n");
  printf (" -power n use x^n for Brent-Suyama's extension\n");
  printf (" -dickson n use n-th Dickson's polynomial for Brent-Suyama's extension\n");
  printf (" -c n perform n runs for each input\n");
  printf (" -pm1 perform P-1 instead of ECM\n");
  printf (" -pp1 perform P+1 instead of ECM\n");
  printf (" -q quiet mode\n");
  printf (" -v verbose mode\n");
  printf (" -timestamp print a time stamp with each number\n");
  printf (" -mpzmod use GMP's mpz_mod for modular reduction\n");
  printf (" -modmuln use Montgomery's MODMULN for modular reduction\n");
  printf (" -redc use Montgomery's REDC for modular reduction\n");
  printf (" -nobase2 disable special base-2 code\n");
  printf (" -nobase2s2 disable special base-2 code in ecm stage 2 only\n");
  printf (" -base2 n force base 2 mode with 2^n+1 (n>0) or 2^|n|-1 (n<0)\n");
  printf (" -ntt enable NTT convolution routines in stage 2\n");
  printf (" -no-ntt disable NTT convolution routines in stage 2\n");
  printf (" -save file save residues at end of stage 1 to file\n");
  printf (" -savea file like -save, appends to existing files\n");
  printf (" -resume file resume residues from file, reads from stdin if file is \"-\"\n");
  printf (" -chkpnt file save periodic checkpoints during stage 1 to file\n");
  printf (" -primetest perform a primality test on input\n");
  printf (" -treefile f [ECM only] store stage 2 data in files f.0, ... \n");
  printf (" -maxmem n use at most n MB of memory in stage 2\n");
  printf (" -stage1time n add n seconds to ECM stage 1 time (for expected time est.)\n");
#ifdef WANT_SHELLCMD
  /* Options that run external shell commands; compiled in on request only. */
  printf (" -faccmd cmd execute cmd when factor is found. Input number, factor\n"
          " and cofactor are given to cmd via stdin, each on a line\n");
  printf (" -prpcmd cmd use shell command cmd to do prp tests (number via stdin)\n");
  printf (" -idlecmd cmd before each curve run cmd and terminate if exit code >0\n");
#endif

  /*printf (" -extra functions added by JimF\n"); */
  printf (" -i n increment B1 by this constant on each run\n");
  printf (" -I f auto-calculated increment for B1 multiplied by 'f' scale factor\n");
  printf (" -inp file Use file as input (instead of redirecting stdin)\n");
  printf (" -b Use breadth-first mode of file processing\n");
  printf (" -d Use depth-first mode of file processing (default)\n");
  printf (" -one Stop processing a candidate if a factor is found (looping mode)\n");
  printf (" -n run ecm in \"nice\" mode (below normal priority)\n");
  printf (" -nn run ecm in \"very nice\" mode (idle priority)\n");
  printf (" -ve n Verbosely show short (< n character) expressions on each loop\n");
  printf (" -cofdec Force cofactor output in decimal (even if expressions are used)\n");
  printf (" -B2scale f Multiplies the default B2 value by f \n");
  printf (" -go val Preload with group order val, which can be a simple expression,\n");
  printf (" or can use N as a placeholder for the number being factored.\n");
  printf (" -printconfig Print compile-time configuration and exit.\n");
  printf (" -batch[=1|2] (experimental) use Montgomery parametrization and batch\n"
          " computation. Option -batch is equivalent to -batch=1\n");
  printf (" -bsaves file In the batch mode, save s in file.\n");
  printf (" -bloads file In the batch mode, load s from file.\n");
  printf (" -h, --help Prints this help and exit.\n");
}

/* Print parameters that were used to build GMP-ECM.

   Writes to stdout the version of the MPIR or GMP headers the program was
   compiled against, followed by one line per build-time macro: either
   "NAME = value" when the macro is defined or "NAME undefined" when it is
   not. Takes no arguments and returns nothing.
   NOTE(review): every numeric macro is printed with %d, which assumes each
   one expands to an int-compatible value -- confirm for the threshold
   macros coming from ecm-params.h. */
static void
print_config ()
{
  printf ("Compilation options:\n");
  /* Multi-precision library headers: MPIR takes precedence over GMP. */
#ifdef __MPIR_VERSION
  printf ("Included MPIR header files version %d.%d.%d\n",
          __MPIR_VERSION, __MPIR_VERSION_MINOR, __MPIR_VERSION_PATCHLEVEL);
#else /* __MPIR_VERSION */
#ifdef __GNU_MP_VERSION_PATCHLEVEL
  printf ("Included GMP header files version %d.%d.%d\n",
          __GNU_MP_VERSION, __GNU_MP_VERSION_MINOR,
          __GNU_MP_VERSION_PATCHLEVEL);
#else
  printf ("Included GMP header files version %d.%d\n",
          __GNU_MP_VERSION, __GNU_MP_VERSION_MINOR);
#endif
#endif /* __MPIR_VERSION */

  /* GWNUM_VERSION is the only entry printed as a string (%s). */
#ifdef GWNUM_VERSION
  printf ("Included GWNUM header files version %s\n", GWNUM_VERSION);
#else
  printf ("GWNUM_VERSION undefined\n");
#endif

#ifdef HAVE_SSE2
  printf ("HAVE_SSE2 = %d\n", HAVE_SSE2);
#else
  printf ("HAVE_SSE2 undefined\n");
#endif

#ifdef HAVE___GMPN_ADD_NC
  printf ("HAVE___GMPN_ADD_NC = %d\n", HAVE___GMPN_ADD_NC);
#else
  printf ("HAVE___GMPN_ADD_NC undefined\n");
#endif

#ifdef HAVE___GMPN_MOD_34LSUB1
  printf ("HAVE___GMPN_MOD_34LSUB1 = %d\n", HAVE___GMPN_MOD_34LSUB1);
#else
  printf ("HAVE___GMPN_MOD_34LSUB1 undefined\n");
#endif

#ifdef HAVE___GMPN_REDC_1
  printf ("HAVE___GMPN_REDC_1 = %d\n", HAVE___GMPN_REDC_1);
#else
  printf ("HAVE___GMPN_REDC_1 undefined\n");
#endif

#ifdef MEMORY_DEBUG
  printf ("MEMORY_DEBUG = %d\n", MEMORY_DEBUG);
#else
  printf ("MEMORY_DEBUG undefined\n");
#endif

  /* WINDOWS64_ABI is reported only when USE_ASM_REDC is defined. */
#ifdef USE_ASM_REDC
  printf ("USE_ASM_REDC = %d\n", USE_ASM_REDC);
#ifdef WINDOWS64_ABI
  printf ("WINDOWS64_ABI = %d\n", WINDOWS64_ABI);
#else
  printf ("WINDOWS64_ABI undefined\n");
#endif
#else
  printf ("USE_ASM_REDC undefined\n");
#endif

#ifdef WANT_ASSERT
  printf ("WANT_ASSERT = %d\n", WANT_ASSERT);
#else
  printf ("WANT_ASSERT undefined\n");
#endif

#ifdef WANT_SHELLCMD
  printf ("WANT_SHELLCMD = %d\n", WANT_SHELLCMD);
#else
  printf ("WANT_SHELLCMD undefined\n");
#endif

#ifdef _OPENMP
  printf ("_OPENMP = %d\n", _OPENMP);
#else
  printf ("_OPENMP undefined\n");
#endif

  /* Tuning thresholds. */
#ifdef MPZMOD_THRESHOLD
  printf ("MPZMOD_THRESHOLD = %d\n", MPZMOD_THRESHOLD);
#else
  printf ("MPZMOD_THRESHOLD undefined\n");
#endif

#ifdef REDC_THRESHOLD
  printf ("REDC_THRESHOLD = %d\n", REDC_THRESHOLD);
#else
  printf ("REDC_THRESHOLD undefined\n");
#endif

#ifdef MUL_NTT_THRESHOLD
  printf ("MUL_NTT_THRESHOLD = %d\n", MUL_NTT_THRESHOLD);
#else
  printf ("MUL_NTT_THRESHOLD undefined\n");
#endif

#ifdef NTT_GFP_TWIDDLE_DIF_BREAKOVER
  printf ("NTT_GFP_TWIDDLE_DIF_BREAKOVER = %d\n",
          NTT_GFP_TWIDDLE_DIF_BREAKOVER);
#else
  printf ("NTT_GFP_TWIDDLE_DIF_BREAKOVER undefined\n");
#endif

#ifdef NTT_GFP_TWIDDLE_DIT_BREAKOVER
  printf ("NTT_GFP_TWIDDLE_DIT_BREAKOVER = %d\n",
          NTT_GFP_TWIDDLE_DIT_BREAKOVER);
#else
  printf ("NTT_GFP_TWIDDLE_DIT_BREAKOVER undefined\n");
#endif

#ifdef PREREVERTDIVISION_NTT_THRESHOLD
  printf ("PREREVERTDIVISION_NTT_THRESHOLD = %d\n",
          PREREVERTDIVISION_NTT_THRESHOLD);
#else
  printf ("PREREVERTDIVISION_NTT_THRESHOLD undefined\n");
#endif

#ifdef POLYINVERT_NTT_THRESHOLD
  printf ("POLYINVERT_NTT_THRESHOLD = %d\n", POLYINVERT_NTT_THRESHOLD);
#else
  printf ("POLYINVERT_NTT_THRESHOLD undefined\n");
#endif

#ifdef POLYEVALT_NTT_THRESHOLD
  printf ("POLYEVALT_NTT_THRESHOLD = %d\n", POLYEVALT_NTT_THRESHOLD);
#else
  printf ("POLYEVALT_NTT_THRESHOLD undefined\n");
#endif

#ifdef MPZSPV_NORMALISE_STRIDE
  printf ("MPZSPV_NORMALISE_STRIDE = %d\n", MPZSPV_NORMALISE_STRIDE);
#else
  printf ("MPZSPV_NORMALISE_STRIDE undefined\n");
#endif
}

/******************************************************************************
*                                                                             *
*                                Main program                                 *
*                                                                             *
******************************************************************************/

int
main (int argc, char *argv[])
{
  char **argv0 = argv;
  mpz_t seed, x, sigma, A, f, orig_x0, B2, B2min, startingB2min;
  mpcandi_t n;
  mpgocandi_t go;
  mpq_t rat_x0;
  double B1, B1done;
  int
result = 0, returncode = 0; int verbose = OUTPUT_NORMAL; /* verbose level */ int timestamp = 0; int method = ECM_ECM, method1; int use_ntt = 1; /* Default, use NTT if input is small enough */ int specific_x0 = 0, /* 1=starting point supplied by user, 0=random or */ /* compute from sigma */ specific_sigma = 0; /* 1=sigma from command line, 0=make random */ int factor_is_prime; /* If a factor was found, indicate whether factor, cofactor are */ /* prime. If no factor was found, both are zero. */ int repr = ECM_MOD_DEFAULT; /* automatic choice */ int nobase2step2 = 0; /* flag to turn off base 2 arithmetic in ecm stage 2 */ unsigned long k = ECM_DEFAULT_K; /* default number of blocks in stage 2 */ int S = ECM_DEFAULT_S; /* Degree for Brent-Suyama extension requested by user. Positive value: use S-th power, negative: use degree |S| Dickson poly, default (0): automatic choice. */ gmp_randstate_t randstate; char *savefilename = NULL, *resumefilename = NULL, *infilename = NULL; char *TreeFilename = NULL, *chkfilename = NULL; char rtime[256] = "", who[256] = "", comment[256] = "", program[256] = ""; FILE *resumefile = NULL, *infile = NULL; mpz_t resume_lastN, resume_lastfac; /* When resuming residues from a file, store the last number processed and the factors found for this it */ int resume_wasPrp = 0; /* 1 if resume_lastN/resume_lastfac is a PRP */ int primetest = 0, saveappend = 0; double autoincrementB1 = 0.0, startingB1; unsigned int autoincrementB1_calc = 0; unsigned int breadthfirst_maxcnt=0, breadthfirst_cnt=0; int breadthfirst = 0; unsigned int count = 1; /* number of curves for each number */ unsigned int cnt = 0; /* number of remaining curves for current number */ unsigned int linenum = 0, factsfound = 0; mpcandi_t *pCandidates = NULL; unsigned int nCandidates=0, nMaxCandidates=0; int deep=1, trial_factor_found; unsigned int displayexpr = 0; unsigned int decimal_cofactor = 0; double B2scale = 1.0; double maxmem = 0.; double stage1time = 0.; ecm_params params; int 
batch = 0; /* By default we don't use batch mode */ char *savefile_s = NULL; char *loadfile_s = NULL; #ifdef WANT_SHELLCMD char *faccmd = NULL; char *idlecmd = NULL; #endif #ifdef HAVE_GWNUM double gw_k = 0.0; /* set default values for gwnum poly k*b^n+c */ unsigned long gw_b = 0; /* set default values for gwnum poly k*b^n+c */ unsigned long gw_n = 0; /* set default values for gwnum poly k*b^n+c */ signed long gw_c = 0; /* set default values for gwnum poly k*b^n+c */ #endif /* check ecm is linked with a compatible library */ if (mp_bits_per_limb != GMP_NUMB_BITS) { fprintf (stderr, "Error, mp_bits_per_limb and GMP_NUMB_BITS differ\n"); fprintf (stderr, "Please check your LD_LIBRARY_PATH variable\n"); exit (1); } #ifdef MEMORY_DEBUG tests_memory_start (); #endif ecm_init (params); /* initialize the group order candidate */ mpgocandi_t_init (&go); /* Init variables we might need to store options */ MPZ_INIT (seed); MPZ_INIT (sigma); MPZ_INIT (A); MPZ_INIT (B2); MPZ_INIT (B2min); MPZ_INIT (startingB2min); mpq_init (rat_x0); /* first look for options */ while ((argc > 1) && (argv[1][0] == '-')) { if (strcmp (argv[1], "-pm1") == 0) { method = ECM_PM1; argv++; argc--; } else if (strcmp (argv[1], "-pp1") == 0) { method = ECM_PP1; argv++; argc--; } else if (strcmp (argv[1], "-q") == 0) { verbose = OUTPUT_ALWAYS; argv++; argc--; } else if (strcmp (argv[1], "-v") == 0) { verbose ++; argv++; argc--; } else if (strcmp (argv[1], "-timestamp") == 0) { timestamp = 1; argv++; argc--; } else if (strcmp (argv[1], "-mpzmod") == 0) { repr = ECM_MOD_MPZ; argv++; argc--; } else if (strcmp (argv[1], "-modmuln") == 0) { repr = ECM_MOD_MODMULN; argv++; argc--; } else if (strcmp (argv[1], "-redc") == 0) { repr = ECM_MOD_REDC; argv++; argc--; } else if (strcmp (argv[1], "-nobase2") == 0) { repr = ECM_MOD_NOBASE2; argv++; argc--; } else if (strcmp (argv[1], "-nobase2s2") == 0) { nobase2step2 = 1; argv++; argc--; } else if (strcmp (argv[1], "-ntt") == 0) { use_ntt = 2; /* Use NTT, even for 
large input numbers */ argv++; argc--; } else if (strcmp (argv[1], "-no-ntt") == 0) { use_ntt = 0; /* Never use NTT */ argv++; argc--; } else if (strcmp (argv[1], "-primetest") == 0) { primetest = 1; argv++; argc--; } else if (strcmp (argv[1], "-one") == 0) { deep = 0; argv++; argc--; } else if (strcmp (argv[1], "-b") == 0) { breadthfirst = 1; argv++; argc--; } else if (strcmp (argv[1], "-batch") == 0 || strcmp (argv[1], "-batch=1") == 0) { batch = 1; argv++; argc--; } else if (strcmp (argv[1], "-batch=2") == 0) { batch = 2; argv++; argc--; } else if ((argc > 2) && (strcmp (argv[1], "-bsaves") == 0)) { savefile_s = argv[2]; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-bloads") == 0)) { loadfile_s = argv[2]; argv += 2; argc -= 2; } else if (strcmp (argv[1], "-h") == 0 || strcmp (argv[1], "--help") == 0) { usage (); exit (EXIT_SUCCESS); } else if (strcmp (argv[1], "-printconfig") == 0) { print_config (); exit (EXIT_SUCCESS); } else if (strcmp (argv[1], "-d") == 0) { /* -1 is a flag used during argv processing where a subsquent -i file will NOT change it. 
Then when done processing args, we change a -1 to a 0 */ breadthfirst = -1; argv++; argc--; } else if (strcmp (argv[1], "-cofdec") == 0) { decimal_cofactor = 1; argv++; argc--; } else if (strcmp (argv[1], "-n") == 0) { NICE10; argv++; argc--; } else if (strcmp (argv[1], "-nn") == 0) { NICE20; argv++; argc--; } else if ((argc > 2) && (strcmp (argv[1], "-x0")) == 0) { if (mpq_set_str (rat_x0, argv[2], 0)) { fprintf (stderr, "Error, invalid starting point: %s\n", argv[2]); exit (EXIT_FAILURE); } specific_x0 = 1; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-sigma")) == 0) { if (mpz_set_str (sigma, argv[2], 0) || mpz_cmp_ui (sigma, 6) < 0) { fprintf (stderr, "Error, invalid sigma value: %s\n", argv[2]); exit (EXIT_FAILURE); } specific_sigma = 1; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-A")) == 0) { if (mpz_set_str (A, argv[2], 0)) { fprintf (stderr, "Error, invalid A value: %s\n", argv[2]); exit (EXIT_FAILURE); } argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-power")) == 0) { S = abs (atoi (argv[2])); /* should this be validated? and a error/abort issued if 0 ??? */ argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-dickson") == 0)) { S = - abs ( atoi (argv[2])); /* should this be validated? and a error/abort issued if 0 ??? */ argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-k") == 0)) { k = atol (argv[2]); /* should this be validated? and a error/abort issued if 0 ??? */ argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-c") == 0)) { count = atoi (argv[2]); /* should this be validated? and a error/abort issued if 0 ??? 
*/ argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-save") == 0)) { savefilename = argv[2]; saveappend = 0; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-savea") == 0)) { savefilename = argv[2]; saveappend = 1; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-resume") == 0)) { resumefilename = argv[2]; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-chkpnt") == 0)) { chkfilename = argv[2]; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-treefile") == 0)) { TreeFilename = argv[2]; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-base2") == 0)) { int b = atoi (argv[2]); if (abs (b) >= 16) /* |Values| < 16 are reserved for other methods */ repr = b; /* keep method unchanged in that case */ argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-i") == 0)) { autoincrementB1 = strtod (argv[2], NULL); if (autoincrementB1 < 1.0) { fprintf (stderr, "Error, the -i n option requires n >= 1\n"); exit (EXIT_FAILURE); } argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-I") == 0)) { autoincrementB1 = strtod (argv[2], NULL); autoincrementB1_calc = 1; if (autoincrementB1 <= 0.0) { fprintf (stderr, "Error, the -I f option requires f > 0\n"); exit (EXIT_FAILURE); } argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-inp") == 0)) { infilename = argv[2]; infile = fopen (infilename, "r"); /* a -d depth-first switch has already been processed, so DO NOT reset to breadth-first */ if (breadthfirst != -1) breadthfirst = 1; if (!infile) { fprintf (stderr, "Can't find input file %s\n", infilename); exit (EXIT_FAILURE); } argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-ve") == 0)) { displayexpr = atoi (argv[2]); if (displayexpr == 0) { fprintf (stderr, "Error, the -ve option requires a number argument\n"); exit (EXIT_FAILURE); } argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-B2scale") == 0)) { B2scale 
= atof (argv[2]); if (verbose >= 2) printf ("Scaling B2 values by a factor of %.4f\n", B2scale); argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-maxmem") == 0)) { maxmem = atof (argv[2]) * 1048576.; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-stage1time") == 0)) { stage1time = atof (argv[2]); argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-go") == 0)) { if (go.cpOrigExpr) { fprintf (stderr, "Warning, for multiple -go options, only the last one is taken into account.\n"); free (go.cpOrigExpr); } go.cpOrigExpr = malloc (strlen (argv[2]) + 1); if (go.cpOrigExpr == NULL) { fprintf (stderr, "Cannot allocate memory in main\n"); exit (1); } strcpy (go.cpOrigExpr, argv[2]); if (strchr (go.cpOrigExpr, 'N')) { go.containsN = 1; go.Valid = 1; /* we actually do not know if it is valid here, but we "assume" until the first time it gets run through */ } else { go.containsN = 0; /* have "fully" parsed expr or number. Do not recompute for each N */ if (eval_str (&(go.Candi), go.cpOrigExpr, 0, NULL)) go.Valid = 1; } argv += 2; argc -= 2; } #ifdef WANT_SHELLCMD else if ((argc > 2) && (strcmp (argv[1], "-prpcmd") == 0)) { prpcmd = argv[2]; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-faccmd") == 0)) { faccmd = argv[2]; argv += 2; argc -= 2; } else if ((argc > 2) && (strcmp (argv[1], "-idlecmd") == 0)) { idlecmd = argv[2]; argv += 2; argc -= 2; } #endif else { fprintf (stderr, "Unknown option: %s\n", argv[1]); exit (EXIT_FAILURE); } } /* check that S is even for old P-1 stage 2 */ if ((method == ECM_PM1) && (S != ECM_DEFAULT_S) && (S % 2 != 0)) { fprintf (stderr, "Error, S should be even for P-1\n"); exit (EXIT_FAILURE); } /* Ok, now we can "reset" the breadthfirst switch so that we do depthfirst as requested */ if (breadthfirst == -1) breadthfirst = 0; if (argc < 2) { fprintf (stderr, "Invalid arguments. 
See %s --help.\n", argv0[0]); exit (EXIT_FAILURE); } /* start of the program */ if (verbose >= 1) { char Gmp_version[64]; char out0[128], *out = out0; #ifdef __MPIR_VERSION sprintf (Gmp_version, "MPIR %d.%d.%d", __MPIR_VERSION, __MPIR_VERSION_MINOR, __MPIR_VERSION_PATCHLEVEL); #else /* original GMP */ sprintf (Gmp_version, "GMP %s", gmp_version); #endif /* __MPIR_VERSION */ out += sprintf (out, "GMP-ECM %s [configured with %s", VERSION, Gmp_version); #ifdef HAVE_GWNUM out += sprintf (out, ", GWNUM %s", GWNUM_VERSION); #endif #ifdef USE_ASM_REDC out += sprintf (out, ", --enable-asm-redc"); #endif #ifdef WANT_ASSERT out += sprintf (out, ", --enable-assert"); #endif printf ("%s] [", out0); switch (method) { case ECM_PM1: printf ("P-1"); break; case ECM_PP1: printf ("P+1"); break; default: printf ("ECM"); } printf ("]\n"); #ifdef HAVE_GETHOSTNAME if (verbose >= 2) { #define MNAMESIZE 64 char mname[MNAMESIZE]; if (gethostname (mname, MNAMESIZE) == 0) { mname[MNAMESIZE - 1] = 0; /* gethostname() may omit trailing 0 */ printf ("Running on %s\n", mname); } } #endif #ifdef HAVE_GWNUM #ifdef gwnum_is_gpl if (! 
gwnum_is_gpl()) #endif printf ("Due to incompatible licenses, this binary file must not " "be distributed.\n"); #endif } /* set first stage bound B1 */ B1 = strtod (argv[1], &argv[1]); if (*argv[1] == '-') { B1done = B1; B1 = strtod (argv[1] + 1, NULL); } else B1done = ECM_DEFAULT_B1_DONE; mpz_set_si (B2min, -1); /* default, means that B2min will be set to B1 by ecm(), pm1() and pp1() */ if (B1 < 0.0 || B1done < 0.0) { fprintf (stderr, "Bound values must be positive\n"); exit (EXIT_FAILURE); } /* check B1 is not too large */ if (B1 > MAX_B1) { fprintf (stderr, "Too large stage 1 bound, limit is %1.0f\n", MAX_B1); exit (EXIT_FAILURE); } init_expr (); mpz_set_si (B2, ECM_DEFAULT_B2); /* compute it automatically from B1 */ /* parse B2 or B2min-B2max */ if (argc >= 3) { int c; double d; char *endptr; /* This is like strtok, but SunOS does not seem to have it declared in any header files, in spite of saying it does in the man pages... */ for (endptr = argv[2]; *endptr != '\0' && *endptr != '-'; endptr++); if (*endptr == '-') *(endptr++) = '\0'; else endptr = NULL; c = -1; { int r; r = gmp_sscanf (argv[2], "%Zd%n", B2, &c); /* Try parsing as integer */ if (r <= 0) { /* restore original value */ if (endptr != NULL) *(--endptr) = '-'; fprintf (stderr, "Invalid B2 value: %s\n", argv[2]); exit (EXIT_FAILURE); } } #ifdef __MINGW32__ /* MinGW scanf() returns a value 1 too high for %n */ /* Reported to MinGW as bug number 1163607 */ if (c > 0 && argv[2][c - 1] == 0) c--; #endif if (c < 0 || argv[2][c] != '\0') { c = -1; gmp_sscanf (argv[2], "%lf%n", &d, &c); /* Try parsing scientific */ #ifdef __MINGW32__ if (c > 0 && argv[2][c - 1] == 0) c--; #endif mpz_set_d (B2, d); } if (c < 0 || argv[2][c] != '\0' || argv[2][0] == '\0') /* If not the whole token could be parsed either way, or if there was no token to begin with (i.e string starting with '-') signal error */ c = -1; else if (endptr != NULL) /* Did we have a '-' in there? 
*/ { mpz_set (B2min, B2); c = -1; gmp_sscanf (endptr, "%Zd%n", B2, &c); #ifdef __MINGW32__ if (c > 0 && endptr[c - 1] == 0) c--; #endif if (c < 0 || endptr[c] != '\0') { gmp_sscanf (endptr, "%lf%n", &d, &c); #ifdef __MINGW32__ if (c > 0 && endptr[c - 1] == 0) c--; #endif mpz_set_d (B2, d); } if (c < 0 || endptr[c] != '\0') c = -1; } if (c == -1) { fprintf (stderr, "Error: expected positive integer(s) B2 or " "B2min-B2\n"); exit (EXIT_FAILURE); } } /* set static parameters (i.e. those that don't change during the program) */ params->verbose = verbose; params->method = method; mpz_set (params->B2, B2); params->k = k; params->S = S; params->repr = repr; params->nobase2step2 = nobase2step2; params->chkfilename = chkfilename; params->TreeFilename = TreeFilename; params->maxmem = maxmem; params->stage1time = stage1time; /* -treefile is valid for ECM only */ if (TreeFilename != NULL && method != ECM_ECM) { fprintf (stderr, "Error: the -treefile option is for ECM only\n"); exit (EXIT_FAILURE); } /* Open resume file for reading, if resuming is requested */ if (resumefilename != NULL) { if (strcmp (resumefilename, "-") == 0) resumefile = stdin; else resumefile = fopen (resumefilename, "r"); if (resumefile == NULL) { fprintf (stderr, "Could not open file %s for reading\n", resumefilename); exit (EXIT_FAILURE); } mpz_init (resume_lastN); mpz_init (resume_lastfac); mpz_set_ui (resume_lastfac, 1); } /* Open save file for writing, if saving is requested */ if (savefilename != NULL) { FILE *savefile; /* Are we not appending and does this file already exist ? 
*/ if (!saveappend && access (savefilename, F_OK) == 0) { printf ("Save file %s already exists, will not overwrite\n", savefilename); exit (EXIT_FAILURE); } /* Test if we can open the file for writing */ savefile = fopen (savefilename, "a"); if (savefile == NULL) { fprintf (stderr, "Could not open file %s for writing\n", savefilename); exit (EXIT_FAILURE); } fclose (savefile); } if (resumefile && (specific_sigma || mpz_sgn (A) || specific_x0)) { printf ("Warning: -sigma, -A and -x0 parameters are ignored when resuming from\nsave files.\n"); mpz_set_ui (sigma, 0); mpz_set_ui (A, 0); specific_x0 = 0; } mpcandi_t_init (&n); /* number(s) to factor */ MPZ_INIT (f); /* factor found */ MPZ_INIT (x); /* stage 1 residue */ MPZ_INIT (orig_x0); /* starting point, for save file */ /* We may need random numbers for sigma/starting point */ gmp_randinit_default (randstate); mpz_set_ui (seed, get_random_ul ()); if (mpz_sizeinbase (seed, 2) <= 32) { mpz_mul_2exp (seed, seed, 32); mpz_add_ui (seed, seed, get_random_ul ()); } if (verbose >= 3) gmp_printf ("Random seed: %Zd\n", seed); gmp_randseed (randstate, seed); /* Install signal handlers */ #ifdef HAVE_SIGNAL /* We catch signals only if there is a savefile. 
Otherwise there's nothing we could save by exiting cleanly, but the waiting for the code to check for signals may delay program end unacceptably */ if (savefilename != NULL) { signal (SIGINT, &signal_handler); signal (SIGTERM, &signal_handler); params->stop_asap = &stop_asap_test; } #endif /* loop for number in standard input or file */ startingB1 = B1; mpz_set (startingB2min, B2min); if (!infilename) infile = stdin; if (breadthfirst == 1) { breadthfirst_maxcnt = count; count = 1; breadthfirst_cnt = 0; } BreadthFirstDoAgain:; if (breadthfirst == 1) { if (breadthfirst_maxcnt > breadthfirst_cnt) { linenum = 0; if (breadthfirst_cnt++) { double NewB1; NewB1 = calc_B1_AutoIncrement (B1, autoincrementB1, autoincrementB1_calc); if (mpz_cmp_d (B2min, B1) <= 0) /* floating-point equality is unreliable, a comparison might be better */ mpz_set_d (B2min, NewB1); B1 = NewB1; } else { /* This is ONLY entered upon the first time through. We load the entire file here so that we can loop deep, or remove a candidate if factor found, or if in deep mode and cofactor is prp (or if original candidate is prp and we are prp testing) */ nMaxCandidates = 100; pCandidates = (mpcandi_t*) malloc (nMaxCandidates * sizeof(mpcandi_t)); if (pCandidates == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } while (!feof (infile)) { if (read_number (&n, infile, primetest)) { mpcandi_t_init (&pCandidates[nCandidates]); mpcandi_t_copy (&pCandidates[nCandidates++], &n); if (nCandidates == nMaxCandidates) { mpcandi_t *tmp = pCandidates; pCandidates = (mpcandi_t*) malloc ((nMaxCandidates + 100) * sizeof(mpcandi_t)); if (pCandidates == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } /* perform a "shallow" copy, in which we do NOT need to free any of the individual elements, but just the array memory */ if (pCandidates) memcpy (pCandidates, tmp, nMaxCandidates*sizeof(mpcandi_t)); nMaxCandidates += 100; /* Free the original "array" memory */ free 
(tmp); } } } /* Now infile is at EOF, but we are in breadthfirst mode, so the main while loop will work with linenum 0 || feof (infile) == 0) && !exit_asap_value) { trial_factor_found = 0; params->B1done = B1done; /* may change with resume */ if (resumefile != NULL) /* resume case */ { if (count != 1) { fprintf (stderr, "Error, option -c and -resume are incompatible\n"); exit (EXIT_FAILURE); } if (!read_resumefile_line (&method, x, &n, sigma, A, orig_x0, &(params->B1done), program, who, rtime, comment, resumefile)) break; if (mpz_cmp (n.n, resume_lastN) == 0) { /* Aha, we're trying the same number again. */ /* We skip this attempt if: 1. the remaining cofactor after the last attempt was a probable prime, or 2. if a factor was found and the user gave the -one option */ if (resume_wasPrp || (deep == 0 && mpz_cmp_ui (resume_lastfac, 1) != 0)) continue; /* If we found a factor in an earlier attempt, divide it out */ if (mpz_cmp_ui (resume_lastfac, 1) > 0) mpcandi_t_addfoundfactor (&n, resume_lastfac, 1); } else { /* It's a different number. Set resume_lastN and resume_lastfac */ mpz_set (resume_lastN, n.n); mpz_set_ui (resume_lastfac, 1); resume_wasPrp = n.isPrp; } cnt = count; /* i.e. 
1 */ if (verbose >= 1) { printf ("Resuming "); if (method == ECM_ECM) printf ("ECM"); else if (method == ECM_PM1) printf ("P-1"); else if (method == ECM_PP1) printf ("P+1"); printf (" residue "); if (program[0] || who[0] || rtime[0]) printf ("saved "); if (who[0]) printf ("by %s ", who); if (program[0]) printf ("with %s ", program); if (rtime[0]) printf ("on %s ", rtime); if (comment[0]) printf ("(%s)", comment); printf ("\n"); } } else /* no-resume case */ { if (cnt) /* nothing to read: reuse old number */ { if (verbose >= OUTPUT_NORMAL) printf ("Run %u out of %u:\n", count - cnt + 1, count); } else /* new number */ { if (!breadthfirst && !read_number (&n, infile, primetest)) break; else if (breadthfirst) mpcandi_t_copy (&n,&pCandidates[linenum]); linenum++; cnt = count; /* reset B1 (and B2min) values, as they could have been advanced on the prior candidate */ if (!breadthfirst) { B1 = startingB1; mpz_set (B2min, startingB2min); } } /* in breadthfirst deep mode, a value of 1 is left after FULLY factoring the number, so we then skip it */ /* Also "blank" lines, or lines that could not be parsed correctly will leave a 1 in this value */ if (n.isPrp) { /* n is 0 or 1 (or -1 I guess) so do NOT proceed with it */ cnt = 0; continue; } /* Set effective seed for factoring attempt on this number */ if (specific_x0) /* convert rational value to integer */ { mpz_t inv; if (count != 1) { fprintf (stderr, "Error, option -c is incompatible with -x0\n"); exit (EXIT_FAILURE); } MPZ_INIT (inv); mpz_invert (inv, mpq_denref (rat_x0), n.n); mpz_mul (inv, mpq_numref (rat_x0), inv); mpz_mod (x, inv, n.n); mpz_clear (inv); } else /* Make a random starting point for P-1 and P+1. 
ECM will */ /* compute a suitable value from sigma or A if x is zero */ { if (method == ECM_ECM) mpz_set_ui (x, 0); if (method == ECM_PP1) pp1_random_seed (x, n.n, randstate); if (method == ECM_PM1) pm1_random_seed (x, n.n, randstate); } if (ECM_IS_DEFAULT_B1_DONE(B1done)) mpz_set (orig_x0, x); /* Make a random sigma if we have neither specific sigma nor A given. Warning: sigma may still contain previous random value and thus be nonzero here even if no specific sigma was given */ if (method == ECM_ECM && !specific_sigma && !mpz_sgn (A)) { /* Make random sigma, 0 < sigma <= 2^32 */ mpz_urandomb (sigma, randstate, 32); mpz_add_ui (sigma, sigma, 6); /* we need sigma >= 6 */ } } if (verbose >= 1) { if ((!breadthfirst && cnt == count) || (breadthfirst && 1 == breadthfirst_cnt)) { /* first time this candidate has been run (if looping more than once */ if (n.cpExpr && n.nexprlen < MAX_NUMBER_PRINT_LEN) printf ("Input number is %s (%u digits)\n", n.cpExpr, n.ndigits); else if (n.ndigits < MAX_NUMBER_PRINT_LEN) { char *s; s = mpz_get_str (NULL, 10, n.n); printf ("Input number is %s (%u digits)\n", s, n.ndigits); FREE (s, n.ndigits + 1); } else { /* Print only first and last ten digits of the number */ mpz_t t, u; mpz_init (t); mpz_init (u); mpz_ui_pow_ui (u, 5, n.ndigits - 10); mpz_tdiv_q_2exp (t, n.n, n.ndigits - 10); mpz_tdiv_q (t, t, u); gmp_printf ("Input number is %Zd...", t); mpz_ui_pow_ui (u, 10, 10); mpz_tdiv_r (t, n.n, u); gmp_printf ("%Zd (%u digits)\n", t, n.ndigits); mpz_clear (u); mpz_clear (t); } if (n.isPrp) printf ("****** Warning: input is probably prime ******\n"); } else /* 2nd or more try for same composite */ { /* Since the expression is usually "so" short, why not just drop it out for ALL loops? 
*/ if (displayexpr) { if (n.nexprlen && n.nexprlen <= displayexpr) printf ("Input number is %s (%u digits)\n", n.cpExpr, n.ndigits); else if (n.ndigits <= displayexpr) { char *s; s = mpz_get_str (NULL, 10, n.n); printf ("Input number is %s (%u digits)\n", s, n.ndigits); FREE (s, n.ndigits + 1); } } } fflush (stdout); } /* Even in verbose=0 we should primality check if told to do so, however, we will print to stderr to keep stdout "clean" for verbose=0 like behavior */ else if (((!breadthfirst && cnt == count) || (breadthfirst && breadthfirst_cnt==1)) && n.isPrp) { char *s; s = mpz_get_str (NULL, 10, n.n); fprintf (stderr, "Input number is %s (%u digits)\n" "****** Warning: input is probably prime ******\n", s, n.ndigits); FREE (s, n.ndigits + 1); } factor_is_prime = 0; cnt --; /* one more curve performed */ mpgocandi_fixup_with_N (&go, &n); /* If we are in batch mode: If A was given one should check that d fits in one word and that x0=2. If A was not given one chooses it at random (and if x0 exists it must be 2). 
*/ if (batch != 0) { if (method != ECM_ECM) { fprintf (stderr, "Error, the -batch option is only valid for ECM\n"); exit (EXIT_FAILURE); } mpz_set_ui (sigma, 0); if (mpz_sgn (orig_x0) == 0) mpz_set_ui (orig_x0, 2); else if (mpz_cmp_ui (orig_x0, 2) != 0) { fprintf (stderr, "Error, x0 should be equal to 2" " in batch mode.\n"); exit (EXIT_FAILURE); } mpz_set (x, orig_x0); } params->batch = batch; if (params->batch != 0 && params->batch_B1 != B1) { int st; params->batch_B1 = B1; if (verbose > OUTPUT_NORMAL) printf ("Batch mode %d: ", batch); st = cputime (); /* construct the batch exponent */ if (loadfile_s != NULL) { /* For now, there is no check that it correspond to the actual B1*/ read_s_from_file (params->batch_s, loadfile_s); if (verbose > OUTPUT_NORMAL) printf ("reading prime product of %zu bits took %ldms\n", mpz_sizeinbase (params->batch_s, 2), cputime () - st); } else { compute_s (params->batch_s, params->batch_B1); if (verbose > OUTPUT_NORMAL) printf ("computing prime product of %zu bits took %ldms\n", mpz_sizeinbase (params->batch_s, 2), cputime () - st); if (savefile_s != NULL) { int ret = write_s_in_file (savefile_s, params->batch_s); if (verbose > OUTPUT_NORMAL && ret > 0) printf ("Save s (%u bytes) in %s.\n", ret, savefile_s); } } } /* set parameters that may change from one curve to another */ params->method = method; /* may change with resume */ mpz_set (params->x, x); /* may change with resume */ /* if sigma is zero, then we use the A value instead */ params->sigma_is_A = ((mpz_sgn (sigma) == 0 || batch != 0) ? 1 : 0); mpz_set (params->sigma, (params->sigma_is_A) ? A : sigma); mpz_set (params->go, go.Candi.n); /* may change if contains N */ mpz_set (params->B2min, B2min); /* may change with -c */ /* Here's an ugly hack to pass B2scale to the library somehow. It gets piggy-backed onto B1done */ params->B1done = params->B1done + floor (B2scale * 128.) 
/ 134217728.; /* Default, for P-1/P+1 with old stage 2 and ECM, use NTT only for small input */ if (use_ntt == 1 && (method == ECM_ECM || S != ECM_DEFAULT_S)) params->use_ntt = (mpz_size (n.n) <= NTT_SIZE_THRESHOLD); else params->use_ntt = use_ntt; #ifdef HAVE_GWNUM /* check if the input number can be represented as k*b^n+c */ if (kbnc_z (&gw_k, &gw_b, &gw_n, &gw_c, n.n)) { params->gw_k = gw_k; params->gw_b = gw_b; params->gw_n = gw_n; params->gw_c = gw_c; if (verbose > OUTPUT_NORMAL) printf ("Found number: %.0f*%lu^%lu + %ld\n", gw_k, gw_b, gw_n, gw_c); } else if (kbnc_str (&gw_k, &gw_b, &gw_n, &gw_c, n.cpExpr, n.n)) { params->gw_k = gw_k; params->gw_b = gw_b; params->gw_n = gw_n; params->gw_c = gw_c; if (verbose > OUTPUT_NORMAL) printf ("Found number: %.0f*%lu^%lu + %ld\n", gw_k, gw_b, gw_n, gw_c); } else { if (verbose > OUTPUT_NORMAL) printf ("Did not find a gwnum poly for the input number.\n"); } #endif #ifdef WANT_SHELLCMD /* See if the system is currently idle, if -idlecmd was given */ if (idlecmd != NULL) { int r; FILE *fc; fc = popen (idlecmd, "r"); if (fc == NULL) { fprintf (stderr, "Error executing idle command: %s\n", idlecmd); exit (EXIT_FAILURE); } r = pclose (fc); if (r != 0) /* If exit status of idle command is non-zero */ { printf ("Idle command returned %d, exiting\n", r); breadthfirst = 0; /* Avoid looping due to goto (ugly, FIXME!) 
*/ break; } } #endif /* WANT_SHELLCMD */ if (timestamp) { time_t t; t = time (NULL); printf ("[%.24s]\n", ctime (&t)); } #if 0 /* Test mpres_muldivbysomething_si() which is not called in normal operation */ mpmod_selftest (n.n); #endif if (mpz_cmp_ui (n.n, 0) <= 0) { fprintf (stderr, "Error, input number should be positive\n"); exit (EXIT_FAILURE); } /* now call the ecm library */ result = ecm_factor (f, n.n, B1, params); if (result == ECM_ERROR) { fprintf (stderr, "Please report internal errors at <%s>.\n", PACKAGE_BUGREPORT); exit (EXIT_FAILURE); } if (result == ECM_NO_FACTOR_FOUND) { if (trial_factor_found) { factor_is_prime = 1; mpz_set_ui (f, 1); returncode = ECM_NO_FACTOR_FOUND; goto OutputFactorStuff; } } else { factsfound++; if (verbose > 0) printf ("********** Factor found in step %u: ", ABS (result)); mpz_out_str (stdout, 10, f); if (verbose > 0) printf ("\n"); /* Complain about non-proper factors (0, negative) */ if (mpz_cmp_ui (f, 1) < 0) { fprintf (stderr, "Error: factor found is "); mpz_out_str (stderr, 10, f); fprintf (stderr, "\nPlease report internal errors at <%s>.\n", PACKAGE_BUGREPORT); exit (EXIT_FAILURE); } #ifdef WANT_SHELLCMD if (faccmd != NULL) { FILE *fc; fc = popen (faccmd, "w"); if (fc != NULL) { mpz_t cof; mpz_init_set (cof, n.n); mpz_divexact (cof, cof, f); gmp_fprintf (fc, "%Zd\n", n.n); gmp_fprintf (fc, "%Zd\n", f); gmp_fprintf (fc, "%Zd\n", cof); mpz_clear (cof); pclose (fc); } } #endif if (mpz_cmp (f, n.n) != 0) { /* prints factor found and cofactor on standard output. */ factor_is_prime = probab_prime_p (f, PROBAB_PRIME_TESTS); if (verbose >= 1) { printf ("Found %s factor of %2u digits: ", factor_is_prime ? 
"probable prime" : "composite", nb_digits (f)); mpz_out_str (stdout, 10, f); printf ("\n"); } mpcandi_t_addfoundfactor (&n, f, 1); /* 1 for display warning if factor does not divide the current candidate */ if (resumefile != NULL) { /* If we are resuming from a save file, add factor to the discovered factors for the current number */ mpz_mul (resume_lastfac, resume_lastfac, f); resume_wasPrp = n.isPrp; } if (factor_is_prime) returncode = (n.isPrp) ? ECM_PRIME_FAC_PRIME_COFAC : ECM_PRIME_FAC_COMP_COFAC; else returncode = (n.isPrp) ? ECM_COMP_FAC_PRIME_COFAC : ECM_COMP_FAC_COMP_COFAC; OutputFactorStuff:; if (verbose >= 1) { printf ("%s cofactor ", n.isPrp ? "Probable prime" : "Composite"); if (n.cpExpr && !decimal_cofactor) printf ("%s", n.cpExpr); else mpz_out_str (stdout, 10, n.n); printf (" has %u digits\n", n.ndigits); } else /* quiet mode: just print a space here, remaining cofactor will be printed after last curve */ printf (" "); /* check for champions (top ten for each method) */ method1 = ((method == ECM_PP1) && (result < 0)) ? 
ECM_PM1 : method; if ((verbose > 0) && factor_is_prime && nb_digits (f) >= champion_digits[method1]) { printf ("Report your potential champion to %s\n", champion_keeper[method1]); printf ("(see %s)\n", champion_url[method1]); } /* Take care of fully factoring this number, in case we are in deep mode */ if (n.isPrp) cnt = 0; /* no more curve to perform */ if (!deep) { if (breadthfirst) /* I know it may not be prp, but setting this will cause all future loops to NOT check this candidate again */ pCandidates[linenum-1].isPrp = 1; cnt = 0; } else if (breadthfirst) mpcandi_t_copy (&pCandidates[linenum-1], &n); } else { if (breadthfirst) /* I know it may not be prp, but setting this will cause all future loops to NOT check this candidate again */ pCandidates[linenum-1].isPrp = 1; cnt = 0; /* no more curve to perform */ if (verbose > 0) printf ("Found input number N"); printf ("\n"); returncode = ECM_INPUT_NUMBER_FOUND; } fflush (stdout); } /* if quiet mode, prints remaining cofactor after last curve */ if ((cnt == 0) && (verbose == 0)) { if (n.cpExpr && !decimal_cofactor) printf ("%s", n.cpExpr); else mpz_out_str (stdout, 10, n.n); putchar ('\n'); fflush (stdout); } /* Write composite cofactors to savefile if requested */ /* If no factor was found, we consider cofactor composite and write it */ if (savefilename != NULL && !n.isPrp) { mpz_mod (x, params->x, n.n); /* Reduce stage 1 residue wrt new co- factor, in case a factor was found */ /* We write the B1done value to the safe file. 
This requires that a correct B1done is returned by the factoring functions */ write_resumefile_line (savefilename, method, params->B1done, sigma, A, x, &n, orig_x0, comment); } /* advance B1, if autoincrement value had been set during command line parsing */ if (!breadthfirst && autoincrementB1 > 0.0) { double NewB1; NewB1 = calc_B1_AutoIncrement (B1, autoincrementB1, autoincrementB1_calc); if (mpz_cmp_d (B2min, B1) <= 0) /* <= might be better than == */ mpz_set_d (B2min, NewB1); B1 = NewB1; } } /* Allow our "breadthfirst" search to re-run the file again if enough curves have not yet been run */ if (breadthfirst == 1 && !exit_asap_value) goto BreadthFirstDoAgain; /* NOTE finding a factor may have caused the loop to exit, but what is left on screen is the wrong count of factors (missing the just found factor. Update the screen to at least specify the current count */ if (infilename) /* infile might be stdin, don't fclose that! */ fclose (infile); if (resumefile) { fclose (resumefile); mpz_clear (resume_lastN); mpz_clear (resume_lastfac); } if (nCandidates) { while (nCandidates--) mpcandi_t_free (&pCandidates[nCandidates]); free (pCandidates); } free_expr (); gmp_randclear (randstate); mpz_clear (orig_x0); mpz_clear (startingB2min); mpz_clear (B2min); mpz_clear (B2); mpz_clear (x); mpz_clear (f); mpcandi_t_free (&n); mpz_clear (sigma); mpz_clear (A); mpq_clear (rat_x0); mpz_clear (seed); mpgocandi_t_free (&go); ecm_clear (params); #ifdef MEMORY_DEBUG tests_memory_end (); #endif /* exit 0 if a factor was found for the last input, except if we exit due to a signal */ #ifdef HAVE_SIGNAL if (returncode == 0 && exit_asap_value != 0) returncode = 143; #endif return returncode; } ecm-6.4.4/ecm-params.h.hppa2.00000644023561000001540000000172012106741273012563 00000000000000/* those parameters were obtained on gcc61.fsffrance.org with ecm-6.4.1-rc3 gmp-5.0.2, and gcc 4.4.1 -O2 -pedantic -mpa-risc-1-1 (note that GMP must be configured with ABI=1.0, see 
http://gmplib.org/list-archives/gmp-bugs/2009-August/001585.html) */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1} #define MPZMOD_THRESHOLD 49 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 262144 #define PREREVERTDIVISION_NTT_THRESHOLD 262144 #define POLYINVERT_NTT_THRESHOLD 262144 #define POLYEVALT_NTT_THRESHOLD 262144 #define MPZSPV_NORMALISE_STRIDE 256 ecm-6.4.4/polyeval.c0000644023561000001540000002362212106741274011223 00000000000000/* Implements algorithm polyeval and remainder tree using middle product. Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2009 Laurent Fousse, Alexander Kruppa, Paul Zimmermann. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include /* for strlen */ #include "ecm-impl.h" #ifdef HAVE_UNISTD_H # include /* for unlink */ #endif #ifndef MAX #define MAX(a,b) (((a) > (b)) ? (a) : (b)) #endif /* #define DEBUG_TREEDATA */ extern unsigned int Fermat; /* algorithm polyeval from section 3.7 of Peter Montgomery's dissertation. Input: G - an array of k elements of R, G[i], 0 <= i < k representing the coefficients of a polynomial G(x) of degree < k Tree - the product tree produced by PolyFromRoots Tree[0][0..k-1] (degree k/2) Tree[1][0..k-1] (degree k/4), ..., Tree[lgk-1][0..k-1] (degree 1) Output: the sequence of values of G(a[i]) are stored in G[i] for 0 <= i < k Remark: we need an auxiliary (k+1)-th cell G[k] in G. The memory used is M(k) = max(3*floor(k/2)+list_mul_mem(floor(k/2)), k+list_mul_mem(ceil(k/2)), floor(k/2) + M(ceil(k/2))). Since list_mul_mem(k) >= 2*k, the maximum is the 1st. */ void polyeval (listz_t G, unsigned int k, listz_t *Tree, listz_t T, mpz_t n, unsigned int sh) { unsigned int l, m; listz_t T0; if (k == 1) return; T0 = Tree[0] + sh; m = k / 2; l = k - m; /* divide G[0]+G[1]*x+...+G[k-1]*x^(k-1) by T0[l]+...+T0[k-1]*x^(m-1)+x^m, quotient in {T+m,l-1}, remainder in {T,m} */ if (k == 2 * m) { /* FIXME: avoid the copy here by giving different 2nd and 3rd arguments to RecursiveDivision */ list_set (T, G, k); /* the following needs k+m+list_mul_mem(m) in T */ RecursiveDivision (T + k, T, T0 + l, m, T + k + m, n, 1); } else /* k = 2m+1: subtract G[k-1]*x^(l-1) * T0 from G */ { /* G - G[k-1] * (x^m + {T0+l,m}) * x^m */ list_set (T, G, m); list_mul_z (T + m, T0 + l, G[k - 1], m, n); list_sub (T + m, G + m, T + m, m); /* the following needs 3m+list_mul_mem(m) in T */ RecursiveDivision (T + 2 * m, T, T0 + l, m, T + 3 * m, n, 1); } /* in both cases we need 3*(k/2)+list_mul_mem(k/2) */ /* right remainder is in {T,m} */ /* k = 2l or k = 2l-1 */ /* divide G[0]+G[1]*x+...+G[k-1]*x^(k-1) by T0[0]+...+T0[l-1]*x^(l-1)+x^l: quotient in {T+m,m-1}, remainder in {G,l} */ if (k < 2 * 
l) mpz_set_ui (G[k], 0); /* the following needs k+list_mul_mem(l) in T */ RecursiveDivision (T + m, G, T0, l, T + k, n, 1); /* left remainder is in {G,l} */ polyeval (G, l, Tree + 1, T + m, n, sh); /* copy right remainder in {G+l,m} */ list_set (G + l, T, m); polyeval (G + l, m, Tree + 1, T, n, sh + l); } #if defined(DEBUG) || defined(DEBUG_TREEDATA) void print_vect (listz_t t, unsigned int l) { unsigned int i; fprintf (ECM_STDOUT, "["); for (i = 0; i < l; i++) { mpz_out_str (ECM_STDOUT, 10, t[i]); if (i != l - 1) fprintf (ECM_STDOUT, ", "); else fprintf (ECM_STDOUT, "]"); } } #endif /* Computes TUpTree as described in ref[1]. k is the degree of the * polynomial at the root of the tree. sh is the shift we need to * apply to find the actual coefficients of the polynomial at the root * of the tree. */ void TUpTree (listz_t b, listz_t *Tree, unsigned int k, listz_t tmp, int dolvl, unsigned int sh, mpz_t n, FILE *TreeFile) { unsigned int m, l; m = k / 2; l = k - m; if (k == 1) return; #ifdef DEBUG fprintf (ECM_STDOUT, "In TupTree, k = %d.\n", k); fprintf (ECM_STDOUT, "b = "); print_vect (b, k); fprintf (ECM_STDOUT, "\nThe polynomials at that level are: "); print_vect (Tree[0] + sh, k); fprintf (ECM_STDOUT, "\n"); #endif if (dolvl == 0 || dolvl == -1) { if (TreeFile != NULL) { list_inp_raw (tmp + k, TreeFile, l); #ifdef DEBUG_TREEDATA printf ("Read from file: "); print_vect (tmp + k, l); #endif TMulGen (tmp + l, m - 1, tmp + k, l - 1, b, k - 1, tmp + k + l, n); list_inp_raw (tmp + k, TreeFile, m); #ifdef DEBUG_TREEDATA print_vect (tmp + k, m); printf ("\n"); #endif TMulGen (tmp, l - 1, tmp + k, m - 1, b, k - 1, tmp + k + m, n); } else { #ifdef DEBUG_TREEDATA printf ("Got from Tree: "); print_vect (Tree[0] + sh, l); print_vect (Tree[0] + sh + l, m); printf ("\n"); #endif TMulGen (tmp + l, m - 1, Tree[0] + sh, l - 1, b, k - 1, tmp + k, n); TMulGen (tmp, l - 1, Tree[0] + sh + l, m - 1, b, k - 1, tmp + k, n); } #if defined(DEBUG) || defined (DEBUG_TREEDATA) fprintf 
(ECM_STDOUT, "And the result at that level (before correction) is:"); print_vect (tmp, k); fprintf (ECM_STDOUT, "\n"); #endif /* GMP-ECM specific: leading coefficients in the product tree * are implicit ones, so we need some extra work here. */ list_add (tmp, tmp, b + m, l); list_add (tmp + l, tmp + l, b + l, m); list_mod (b, tmp, k, n); /* reduce both parts simultaneously */ #ifdef DEBUG fprintf (ECM_STDOUT, "And the result at this level is:"); print_vect (b, k); fprintf (ECM_STDOUT, "\n"); #endif } if (dolvl > 0 || dolvl == -1) { if (dolvl > 0) dolvl--; TUpTree (b, Tree + 1, l, tmp, dolvl, sh, n, TreeFile); TUpTree (b + l, Tree + 1, m, tmp, dolvl, sh + l, n, TreeFile); } } static unsigned int TUpTree_space (unsigned int k) { unsigned int m, l; unsigned int r1, r2; m = k / 2; l = k - m; if (k == 1) return 0; r1 = TMulGen_space (l - 1, m - 1, k - 1) + l; if (m != l) { r2 = TMulGen_space (m - 1, l - 1, k - 1) + k; r1 = MAX (r1, r2); } r2 = TUpTree_space (l); r1 = MAX (r1, r2); if (m != l) { r2 = TUpTree_space (m); r1 = MAX (r1, r2); } return r1; } /* Same as polyeval. Needs invF as extra argument. Return non-zero iff an error occurred. 
*/ int polyeval_tellegen (listz_t b, unsigned int k, listz_t *Tree, listz_t tmp, unsigned int sizeT, listz_t invF, mpz_t n, char *TreeFilename) { unsigned int tupspace; unsigned int tkspace; int allocated = 0, r = 0; /* return value, 0 = no error */ listz_t T; ASSERT(Tree != NULL || TreeFilename != NULL); tupspace = TUpTree_space (k) + k; #ifndef USE_SHORT_PRODUCT tkspace = TMulGen_space (k - 1, k - 1, k - 1) + k; #else tkspace = 2 * k - 1 + list_mul_mem (k); #endif tupspace = MAX (tupspace, tkspace); if (TreeFilename != NULL) tupspace += (k + 1) / 2; if (sizeT >= tupspace) T = tmp; else { outputf (OUTPUT_DEVVERBOSE, "polyeval_tellegen: allocating extra temp" " space, want %d but T has only %d\n", tupspace, sizeT); MEMORY_TAG; T = init_list (tupspace); MEMORY_UNTAG; if (T == NULL) return ECM_ERROR; allocated = 1; } #ifdef TELLEGEN_DEBUG fprintf (ECM_STDOUT, "In polyeval_tellegen, k = %d.\n", k); fprintf (ECM_STDOUT, "Required memory: %d.\n", TMulGen_space (k - 1, k - 1, k - 1)); #endif if (Fermat) { /* Schoenhage-Strassen can't do a half product faster than a full */ F_mul (T, invF, b, k, DEFAULT, Fermat, T + 2 * k); list_mod (T, T + k - 1, k, n); } else { #ifdef USE_SHORT_PRODUCT /* need space 2k-1+list_mul_mem(k) in T */ list_mul_high (T, invF, b, k, T + 2 * k - 1); list_mod (T, T + k - 1, k, n); #else /* revert invF for call to TMulGen below */ list_revert (invF, k); TMulGen (T, k - 1, invF, k - 1, b, k - 1, T + k, n); #endif } list_revert (T, k); if (TreeFilename != NULL) { unsigned int lgk, i; FILE *TreeFile; char *fullname = (char *) malloc (strlen (TreeFilename) + 1 + 2 + 1); if (fullname == NULL) { fprintf (stderr, "Cannot allocate memory in polyeval_tellegen\n"); exit (1); } lgk = ceil_log2 (k); for (i = 0; i < lgk; i++) { sprintf (fullname, "%s.%d", TreeFilename, i); TreeFile = fopen (fullname, "rb"); if (TreeFile == NULL) { outputf (OUTPUT_ERROR, "Error opening file %s for product tree of F\n", fullname); r = ECM_ERROR; goto clear_T; } TUpTree (T, NULL, 
k, T + k, i, 0, n, TreeFile); fclose (TreeFile); unlink (fullname); } free (fullname); } else TUpTree (T, Tree, k, T + k, -1, 0, n, NULL); list_swap (b, T, k); /* more efficient than list_set, since T is not needed anymore */ clear_T: if (allocated) clear_list (T, tupspace); return r; } ecm-6.4.4/mul_fft-params.h.default0000644023561000001540000000017612106741273013733 00000000000000/* Empty file so that #include won't produce an error message. With no parameters defined, mul_fft.c will use defaults. */ ecm-6.4.4/m4/0000755023561000001540000000000012113421640007605 500000000000000ecm-6.4.4/configure.in0000644023561000001540000006037212113201454011525 00000000000000m4_define([ECM_VERSION], [6.4.4]) AC_PREREQ([2.57]) AC_INIT([ecm], ECM_VERSION, [ecm-discuss@lists.gforge.inria.fr]) AC_CONFIG_HEADER([config.h]) AC_CONFIG_MACRO_DIR([m4]) GMP_INIT([config.m4]) AM_INIT_AUTOMAKE([1.10]) AM_INIT_AUTOMAKE([ecm], ECM_VERSION) AC_CANONICAL_HOST dnl Copied from MPFR 2.4.2: unset GMP_CFLAGS GMP_CC user_redefine_cc dnl Check if user request his CC and CFLAGS if test -n "$CFLAGS" || test -n "$CC" ; then user_redefine_cc=yes fi dnl the following is required to compile auxi.c according to autoconf 2.61 AC_PROG_EGREP AC_PROG_SED AC_ARG_WITH([gmp], [ --with-gmp=DIR GMP install directory ], [with_gmp_include=$withval/include with_gmp_lib=$withval/lib]) AC_ARG_WITH([gmp_include], [ --with-gmp-include=DIR GMP include directory ], [with_gmp_include=$withval]) AC_ARG_WITH([gmp_lib], [ --with-gmp-lib=DIR GMP lib directory ], [with_gmp_lib=$withval]) AC_ARG_WITH([gwnum], [ --with-gwnum=DIR GWNUM source directory ], [with_gwnum=$withval]) if test x"$with_gmp_include" != "x" then if ! test -d "$with_gmp_include" then AC_MSG_ERROR([Specified GMP include directory "$with_gmp_include" does not exist]) fi CPPFLAGS="-I$with_gmp_include" fi if test x"$with_gmp_lib" != "x" then if ! 
test -d "$with_gmp_lib" then AC_MSG_ERROR([Specified GMP library directory "$with_gmp_lib" does not exist]) fi fi ############################ # Parse --enable-* options # ############################ dnl Assertions are enabled by default for beta/rc releases. The last parameter dnl of AC_ARG_ENABLE() sets the default value (change also default=...). AC_ARG_ENABLE([assert], [AS_HELP_STRING([--enable-assert], [enable ASSERT checking [[default=no]]])],[],[enable_assert=no]) if test "x$enable_assert" = xyes; then AC_DEFINE([WANT_ASSERT],1,[Define to 1 if you want assertions enabled]) GMP_DEFINE([WANT_ASSERT], 1) fi AC_ARG_ENABLE([shellcmd], [AS_HELP_STRING([--enable-shellcmd], [enable shell command execution [[default=no]]])]) if test "x$enable_shellcmd" = xyes; then AC_DEFINE([WANT_SHELLCMD],1,[Define to 1 if you want shell command execution]) fi AC_ARG_ENABLE([gmp-cflags], [AS_HELP_STRING([--enable-gmp-cflags], [enable importing CFLAGS from gmp.h [[default=yes]]])],[],[enable_gmp_cflags=yes]) AC_ARG_ENABLE([openmp], [AS_HELP_STRING([--enable-openmp], [enable OpenMP multi-threading [[default=no]]])]) AC_ARG_ENABLE([sse2], [AS_HELP_STRING([--enable-sse2], [use SSE2 instructions in NTT code (default=yes for 32-bit x86 systems, if supported)])]) AC_ARG_ENABLE([asm-redc], [AS_HELP_STRING([--enable-asm-redc], [use an asm redc (default=yes on x86_64 and powerpc64, no on others)])]) AC_ARG_ENABLE([memory-debug], [AS_HELP_STRING([--enable-memory-debug], [enable memory debugging [[default=no]]])]) if test "x$enable_memory_debug" = xyes; then AC_DEFINE([MEMORY_DEBUG], 1, [Define to 1 if you want memory debugging]) fi AM_CONDITIONAL([MEMORY_DEBUG], [test "x$enable_memory_debug" = xyes]) AC_ARG_ENABLE([mulredc-svoboda], [AS_HELP_STRING([--enable-mulredc-svoboda], [enable Svoboda mulredc [[default=no]]])]) if test "x$enable_mulredc_svoboda" = xyes; then AC_DEFINE([MULREDC_SVOBODA],1,[Define to 1 if you want Svoboda mulredc]) GMP_DEFINE([MULREDC_SVOBODA], 1) fi dnl Use C language 
for test programs AC_LANG([C]) dnl Copied from MPFR 2.4.1 and modified dnl We need to guess the C preprocessor instead of using AC_PROG_CPP, dnl since AC_PROG_CPP implies AC_PROG_CC, which chooses a compiler dnl (before we have the chance to get it from gmp.h) and does some dnl checking related to this compiler (such as dependency tracking dnl options); if the compiler changes due to __GMP_CC in gmp.h, one dnl would have incorrect settings. dnl FIXME: Move this in aclocal ? if test "x$user_redefine_cc" = x && test "x$enable_gmp_cflags" = xyes && test "x$cross_compiling" != xyes; then if test "x$GMP_CC$GMP_CFLAGS" = x; then AC_MSG_CHECKING([for CC and CFLAGS in gmp.h]) GMP_CC=__GMP_CC GMP_CFLAGS=__GMP_CFLAGS for cpp in /lib/cpp gcc cc c99 do test $cpp = /lib/cpp || cpp="$cpp -E" echo "Trying to run $cpp" >&AS_MESSAGE_LOG_FD AC_LANG_CONFTEST([AC_LANG_SOURCE([foo])]) if $cpp $CPPFLAGS conftest.$ac_ext > /dev/null 2> /dev/null ; then # Get CC and CFLAGS AC_LANG_CONFTEST([AC_LANG_SOURCE([#include "gmp.h" MPFR_OPTION_CC __GMP_CC MPFR_OPTION_CFLAGS __GMP_CFLAGS])]) echo "Trying to parse gmp.h with: $cpp $CPPFLAGS conftest.$ac_ext" >&AS_MESSAGE_LOG_FD if $cpp $CPPFLAGS conftest.$ac_ext 2> /dev/null > conftest.out; then GMP_CC="`$EGREP MPFR_OPTION_CC conftest.out | $SED -e 's/MPFR_OPTION_CC //g' | $SED -e 's/"//g'`" GMP_CFLAGS="`$EGREP MPFR_OPTION_CFLAGS conftest.out | $SED -e 's/MPFR_OPTION_CFLAGS //g'| $SED -e 's/"//g'`" echo "Success, GMP_CC=$GMP_CC, GMP_CFLAGS=$GMP_CFLAGS" >&AS_MESSAGE_LOG_FD break else echo "Could not parse gmp.h with $cpp" >&AS_MESSAGE_LOG_FD fi else echo "Could not run $cpp" >&AS_MESSAGE_LOG_FD fi done rm -f conftest* if test "x$GMP_CC" = "x__GMP_CC" || test "x$GMP_CFLAGS" = "x__GMP_CFLAGS" ; then AC_MSG_RESULT([no]) GMP_CFLAGS= GMP_CC= else AC_MSG_RESULT([yes CC=$GMP_CC CFLAGS=$GMP_CFLAGS]) fi fi dnl But these variables may be invalid, so we must check them first. dnl Note: we do not use AC_RUN_IFELSE, as it implies AC_PROG_CC. 
if test "x$GMP_CC$GMP_CFLAGS" != x; then AC_MSG_CHECKING([whether CC=$GMP_CC and CFLAGS=$GMP_CFLAGS works]) AC_LANG_CONFTEST([AC_LANG_SOURCE([int main (void) { return 0; }])]) if $GMP_CC $GMP_CFLAGS -o conftest conftest.$ac_ext 2> /dev/null ; then AC_MSG_RESULT([yes]) CFLAGS=$GMP_CFLAGS CC=$GMP_CC else AC_MSG_RESULT([no, reverting to default]) fi rm -f conftest* fi fi dnl Checks for programs. AC_PROG_CC AM_PROG_AS AM_PROG_CC_C_O # Now that we have decided on CC and CFLAGS, init libtool # Don't make a shared library by default. Enable building a shared library # by specifying "--enable-shared" on the ./configure command line LT_PREREQ([2.2.6]) LT_INIT([disable-shared]) dnl Checks for typedefs, structures, and compiler characteristics. AC_TYPE_INT64_T AC_TYPE_UINT64_T AC_TYPE_LONG_LONG_INT dnl Check if compiler supports "const," if not define it to empty string AC_C_CONST dnl Check if compiler supports "inline," if not define it to dnl __inline__ or __inline or the empty string AC_C_INLINE dnl Check if both time.h and sys/time.h can be included AC_HEADER_TIME dnl Define size_t to something useable if standard headers don't AC_TYPE_SIZE_T dnl If OpenMP is enabled, check which command line parameter (if any) dnl if required to make the compiler enable OpenMP if test "x$enable_openmp" = xyes; then AC_OPENMP fi ######################## # Enable asm redc code # ######################## # If --(en|dis)able-asm-redc not specified, choose default value # Test if asm redc code is available for this cpu. # Point ASMPATH to the correct subdirectory. # asm_redc enabled by default for x86_64 and 64 bit PowerPC if test "x$enable_asm_redc" = x; then case $host in x86_64*-*-* | powerpc-apple-darwin* | powerpc64-*-linux*) enable_asm_redc=yes;; *) enable_asm_redc=no;; esac fi if test "x$enable_asm_redc" = xyes; then case $host in pentium4-*-* | pentium3-*-* | viac7-*-* | i786-*-*) ASMPATH=pentium4;; x86_64*-*-*) # In case GMP has been compiled with a 32-bit ABI... 
# Use AC_COMPILE_IFELSE instead of AC_PREPROC_IFELSE, otherwise # GMP's CFLAGS doesn't seem to be taken into account. AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ #if defined(__i386__) #error #endif])], [], [AC_MSG_NOTICE([32-bit ABI (i386), disabling asm-redc]) enable_asm_redc=no]) ASMPATH=x86_64;; # warning: with powerpc-apple-darwin* we can have ABI=32 # see bug #10646 on the bug tracker, where config.guess says # powerpc-apple-darwin8.11.0 (this a 64-bit machine, but most applications # are compiled in 32 bits). It works with --disable-asm-redc. powerpc-apple-darwin*) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ #if defined(__ppc__) #error #endif])], [], [AC_MSG_NOTICE([32-bit PowerPC, disabling asm-redc]) enable_asm_redc=no]) ASMPATH=powerpc64;; powerpc64-*-linux*) ECM_INCLUDE([<"$srcdir"/powerpc64/powerpc-defs.m4>]) ASMPATH=powerpc64;; i[[56]]86-*-* | k[[78]]*-*-* | athlon*-*-* | pentiumpro-*-* | \ pentium2-*-* | viac3*-*-* | i686-apple-darwin*) ASMPATH=athlon;; *) AC_MSG_ERROR([[asm redc not available on this machine $host]]);; esac fi if test "x$enable_asm_redc" = xyes; then # do the necessary definitions and includes AC_DEFINE([USE_ASM_REDC],1,[Define to 1 to use asm redc]) test "x$CCAS" != x || CCAS="$CC -c" AC_SUBST([CCAS]) GMP_PROG_M4 GMP_ASM_UNDERSCORE GMP_ASM_TEXT GMP_ASM_GLOBL GMP_ASM_TYPE case $host in *-*-mingw32) GMP_DEFINE([WINDOWS64_ABI], 1) AC_DEFINE([WINDOWS64_ABI], 1,[Define to 1 if x86_64 mulredc*() functions should be called with Windows ABI]);; *) ;; esac case $host in pentium3-*-*) echo "WARNING: Your processor is recognized as Pentium3." echo " The asm code uses SSE2, and therefore it might" echo " fail if your proc is indeed a P3, and not a" echo " Pentium M. If you have compilation problems," echo " consider using --disable-asm-redc." 
;; *) esac fi AM_CONDITIONAL([ENABLE_ASM_REDC], [test "x$enable_asm_redc" = xyes]) ############################ # Enable SSE2 instructions # ############################ # Test if we should use SSE2 instructions and if the cpu supports them if test "x$enable_sse2" = "x"; then dnl Default: if we build for Pentium 4, enable SSE2 code for the NTT dnl Some SSE2 enabled cpus are identified as i686, we enables SSE2 dnl for them by default and let the tests below check if it works case $host in pentium4-*-* | viac7-*-* | i686-*-* | i786-*-*) enable_sse2=yes ;; esac fi # If the necessary predefines (__GNUC__ or __ICL, and __i386__) # are not set, SSE2 will never be compiled in, and we switch off # the SSE2 flag if test "x$enable_sse2" = xyes; then AC_PREPROC_IFELSE([AC_LANG_PROGRAM([ #if !defined(__GNUC__) && !defined(__ICL) || !defined(__i386__) #error #IRIXdoesnotexitaterrordirective #endif])], [], dnl Necessary predefines are present. Nothing to do [ dnl Necessary predefines are not present. Switch off SSE2 AC_MSG_NOTICE([Not using GCC or ICC, or not a 32-bit x86. SSE2 disabled]) enable_sse2=no]) fi if test "x$enable_sse2" = xyes; then # See if we need -msse2 to enable SSE2 instructions AC_MSG_CHECKING([for SSE2 support]) m4_define([SSE2_TEST_PROG], [AC_LANG_PROGRAM([], dnl [#if (defined(__GNUC__) || defined(__ICL)) && defined(__i386__) /* On some machines, a program without constraints may pass without -msse2 but those with constraints in spv.c fail, thus we test with constraints here. 
*/ asm volatile ("pmuludq %%xmm2, %%xmm0" : : :"%xmm0"); #else #error #IRIXdoesnotexitaterrordirective #endif])]) AC_COMPILE_IFELSE([SSE2_TEST_PROG], dnl [ dnl SSE2 works, nothing to be done AC_MSG_RESULT([yes])], dnl [ dnl SSE2 does not work, try again with -msse2 OLDCFLAGS="$CFLAGS" CFLAGS="$CFLAGS -msse2" AC_COMPILE_IFELSE([SSE2_TEST_PROG], dnl [ dnl works now, keep CFLAGS like this AC_MSG_RESULT([yes, with -msse2])], dnl [ dnl still didn't work, switch off SSE2 CFLAGS="$OLDCFLAGS" enable_sse2=no AC_MSG_RESULT([not supported, SSE2 disabled]) ])]) fi if test "x$enable_sse2" = xyes; then AC_DEFINE([HAVE_SSE2],1,[Define to 1 to enable SSE2 instructions in NTT code]) fi ######################## # Add GWNUM if desired # ######################## dnl If user wants GWNUM, check if the file exists (either as .a or .lib) if test "x$with_gwnum" != "x"; then if test "x$enable_openmp" = xyes; then AC_MSG_ERROR([Woltman's GWNUM currently cannot be used together with OpenMP]) fi AC_CHECK_FILE([$with_gwnum/gwnum.a], [ AC_DEFINE([HAVE_GWNUM], 1, [Define to 1 if gwnum.a or gwnum.lib exist]) GWLIB="$with_gwnum/gwnum.a -lpthread" CPPFLAGS="$CPPFLAGS -I$with_gwnum" ],[ AC_CHECK_FILE([$with_gwnum/gwnum.lib], [ AC_DEFINE([HAVE_GWNUM], 1, [Define to 1 if gwnum.a or gwnum.lib exist]) GWLIB="$with_gwnum/gwnum.lib -lpthread" CPPFLAGS="$CPPFLAGS -I$with_gwnum" ],[ with_gwnum= AC_MSG_ERROR([Woltman's GWNUM library not found]) ]) ]) fi AM_CONDITIONAL([WITH_GWNUM], [test "x$with_gwnum" != "x"]) dnl Checks for header files. 
AC_FUNC_ALLOCA AC_HEADER_STDC AC_CHECK_HEADERS([math.h limits.h malloc.h strings.h sys/time.h unistd.h io.h signal.h fcntl.h]) AC_CHECK_HEADERS([windows.h]) AC_CHECK_HEADERS([ctype.h sys/types.h sys/resource.h]) dnl Checks for library functions that are not in GMP AC_FUNC_STRTOD dnl Check functions in the math library AC_CHECK_LIB(m,pow,,AC_MSG_ERROR(required function missing)) AC_CHECK_LIB(m,floor,,AC_MSG_ERROR(required function missing)) AC_CHECK_LIB(m,sqrt,,AC_MSG_ERROR(required function missing)) AC_CHECK_LIB(m,fmod,,AC_MSG_ERROR(required function missing)) AC_CHECK_LIB(m,cos) dnl Check for GSL but don't add it to LIBS, since only rho uses it and dnl we don't want all other binaries to depend on it. dnl If found, pass link flags to Makefile via GSL_LD_FLAGS GSL_LD_FLAGS= dnl Check if "-lgslcblas" works. If yes, check if "-lgsl -lgslcblas" works. dnl If both work, put "-lgsl -lgslcblas" in GSL_LD_FLAGS AC_CHECK_LIB([gslcblas],[cblas_dgemm], dnl [AC_CHECK_LIB([gsl],[gsl_blas_dgemm], dnl [AC_DEFINE([HAVE_LIBGSL],1,[Define to 1 if you have the `gsl' library (-lgsl).]) AC_DEFINE([HAVE_LIBGSLCBLAS],1,[Define to 1 if you have the `gslcblas' library (-lgslcblas).]) GSL_LD_FLAGS="-lgsl -lgslcblas"], [], dnl dnl Here comes the "OTHER-LIBRARIES" field for AC_CHECK_LIB [gsl]: [-lgslcblas])]) AC_CHECK_FUNCS([isascii memset strchr strlen strncasecmp strstr], [], [AC_MSG_ERROR([required function missing])]) AC_CHECK_FUNCS([access unlink], [], [AC_MSG_ERROR([required function missing])]) AC_CHECK_FUNCS([isspace isdigit isxdigit], [], [AC_MSG_ERROR([required function missing])]) AC_CHECK_FUNCS([time ctime], [], [AC_MSG_ERROR([required function missing])]) AC_CHECK_FUNCS([setpriority nice gethostname gettimeofday getrusage memmove signal fcntl fileno]) dnl FIXME: which win32 library contains these functions? 
dnl AC_CHECK_FUNCS([GetCurrentProcess GetProcessTimes]) AC_CHECK_FUNCS([malloc_usable_size]) dnl If we use GCC and user has not specified his own CFLAGS, dnl add some warning flags, avoiding duplication if test "x$GCC" = xyes && test "x$user_redefine_cc" != xyes; then case $CFLAGS in "-pedantic "* | *" -pedantic "* | *" -pedantic") ;; *) CFLAGS="-pedantic $CFLAGS" esac case $CFLAGS in "-Wundef "* | *" -Wundef "* | *" -Wundef") ;; *) CFLAGS="-Wundef $CFLAGS" esac case $CFLAGS in "-Wall "* | *" -Wall "* | *" -Wall") ;; *) CFLAGS="-Wall $CFLAGS" esac case $CFLAGS in "-W "* | *" -W "* | *" -W") ;; *) CFLAGS="-W $CFLAGS" esac # CFLAGS="-Wall -W -Wundef -pedantic $CFLAGS" fi dnl Find GMP and do some sanity checks dnl Tests concerning the include directories. if test -d "$with_gmp_include"; then dnl AC_CHECK_HEADERS and AC_PREPROC_IFELSE uses CPPFLAGS but not CFLAGS CPPFLAGS="-I$with_gmp_include $CPPFLAGS" else with_gmp_include= fi AC_CHECK_HEADERS([gmp.h], [], AC_MSG_ERROR([required header file missing])) dnl This needs to find only the header file so we can do it here, before dnl we start looking for libgmp.a AC_MSG_CHECKING([for recent GMP]) AC_PREPROC_IFELSE([AC_LANG_SOURCE([[ #include #if (__GNU_MP_VERSION <= 4) #error #IRIXdoesnotexitaterrordirective #endif ]])], [AC_MSG_RESULT(yes)], [AC_MSG_RESULT(no) AC_MSG_ERROR([GMP 5.0.0 or newer is required])] ) dnl Figure out if GMP is MPIR AC_MSG_CHECKING([if GMP is MPIR]) AC_PREPROC_IFELSE([AC_LANG_SOURCE([[ #include #ifndef __MPIR_VERSION #error #IRIXdoesnotexitaterrordirective #endif ]])], [AC_MSG_RESULT(yes) AC_DEFINE([HAVE_MPIR],1,[Define to 1 if GMP is MPIR])], [AC_MSG_RESULT(no)]) dnl Now choose how to link the GMP library. If we can, we'd prefer to dnl link it statically since that makes for faster function calls. To dnl link it statically, we mustn't build a dynamic GMP-ECM library and dnl we need to find the libgmp.a file. At the moment, we only look for dnl it at the path specified by the user (i.e. 
--with-gmp) but make no dnl attempt to find it in the default system lib directories. dnl If GMP is linked statically, we pass its path/filename to the Makefile dnl via GMPLIB, otherwise -lgmp is passed via GMPLIB. dnl The search path to the dynamic GMP library is added to LDFLAGS, dnl if GMP is not specified by full pathname. GMPLDFLAGS="" if test -d "$with_gmp_lib"; then GMPLDFLAGS="-L$with_gmp_lib" fi GMPLIB="-lgmp" if test "x$enable_shared" != xyes; then if test -r "$with_gmp_lib/libgmp.a"; then GMPLIB="$with_gmp_lib/libgmp.a" dnl Don't need -L flag since we give full path to libgmp.a GMPLDFLAGS="" fi fi AC_SUBST([GMPLIB]) LDFLAGS="$LDFLAGS $GMPLDFLAGS" dnl Test linking against GMP. This tests, for example, that the compiler dnl and GMP agree on the ABI (32 or 64 bit). AC_CHECK_LIB() does not work, dnl as it requires linking the library with -lgmp, whereas we may want dnl to specify it by full pathname as we do in the Makefile AC_MSG_CHECKING([whether we can link against GMP]) LIBS_BACKUP="$LIBS" LIBS="$LIBS $GMPLIB" AC_LINK_IFELSE([AC_LANG_PROGRAM([[ #ifdef HAVE_GMP_H #include #endif]],[[ mpz_t t; mpz_init(t); mpz_clear(t); return 0; ]])], [AC_MSG_RESULT(yes)], [AC_MSG_RESULT(no) AC_MSG_ERROR([Could not link against GMP library.])] ) dnl Check for corresponding 'gmp.h' and libgmp.a dnl This requires running a program linked against GMP, dnl so is done after the link-only test. 
AC_MSG_CHECKING([if gmp.h version and libgmp version are the same]) AC_RUN_IFELSE([AC_LANG_PROGRAM([[ #include #include #include ]],[[ char buffer[100]; if (__GNU_MP_VERSION == 4 && __GNU_MP_VERSION_MINOR <= 2 && __GNU_MP_VERSION_PATCHLEVEL == 0) sprintf (buffer, "%d.%d", __GNU_MP_VERSION, __GNU_MP_VERSION_MINOR); else sprintf (buffer, "%d.%d.%d", __GNU_MP_VERSION, __GNU_MP_VERSION_MINOR, __GNU_MP_VERSION_PATCHLEVEL); printf ("(%s/%s) ", buffer, gmp_version); return strcmp (buffer, gmp_version); ]])], [AC_MSG_RESULT([yes])], [AC_MSG_RESULT([no]) AC_MSG_ERROR(['gmp.h' and 'libgmp' have different versions, you have to reinstall GMP properly.])], [AC_MSG_RESULT([cross-compiling: cannot test])] ) AC_CHECK_FUNCS([__gmpn_add_nc __gmpn_mod_34lsub1 __gmpn_redc_1 __gmpn_redc_2]) AC_CHECK_FUNCS([__gmpn_mullo_n __gmpn_redc_n __gmpn_preinv_mod_1]) LIBS="$LIBS_BACKUP" dnl Check if the compiler understands some __attribute__ directives AC_MSG_CHECKING([whether compiler knows __attribute__((hot))]) dnl The AC_LANG_WERROR directive causes configure to consider a test dnl compilation unsuccessful if it produced any output on stderr. dnl We use it since unknown __attribute__ only cause a warning, not an dnl error. 
Unfortunately there seems to be no way to switch it off again, dnl so this test is at the end of the configure script AC_LANG_WERROR AC_COMPILE_IFELSE([AC_LANG_PROGRAM([void foo() __attribute__ ((hot)); void foo() {return;}], [foo()])], [AC_DEFINE([ATTRIBUTE_HOT],[__attribute__ ((hot))], [How to specify hot-spot attribute, if available]) AC_MSG_RESULT([yes]) ], [AC_DEFINE([ATTRIBUTE_HOT],[ ], [How to specify hot-spot attribute, if available]) AC_MSG_RESULT([no]) ]) dnl Check for xsltproc AC_CHECK_PROG([XSLTPROC],[xsltproc],[xsltproc]) if test "x$XSLTPROC" != x; then AC_MSG_CHECKING([for docbook.xsl]) if test "x$XSLDIR" = x; then if test -d "/usr/local/share/sgml/docbook/xsl-stylesheets"; then XSLDIR="/usr/local/share/sgml/docbook/xsl-stylesheets" elif test -d "/usr/share/sgml/docbook/xsl-stylesheets"; then XSLDIR="/usr/share/sgml/docbook/xsl-stylesheets" elif test -d "/usr/local/share/docbook/"; then XSLDIR="/usr/local/share/docbook/" elif test -d "/usr/share/docbook/"; then XSLDIR="/usr/share/docbook/" fi fi if test -r "$XSLDIR/manpages/docbook.xsl"; then AC_MSG_RESULT([yes]) make_manpage="yes" else AC_MSG_RESULT([no]) fi fi AM_CONDITIONAL([MAKE_MANPAGE], [test "x$make_manpage" = xyes]) dnl Check for valgrind. 
GMP-ECM uses exit code 1 to signal error, dnl so we make valgrind use that code to signal error, too AC_CHECK_PROG([VALGRIND], [valgrind], [valgrind -q --error-exitcode=1]) case $host in athlon*-*-*) config_arch="athlon" ;; x86_64*-*-*) config_arch="athlon64" ;; pentium3-*-*) config_arch="pentium3" ;; pentium4-*-*) config_arch="pentium4" ;; pentium-m-*-*) config_arch="pentium-m" ;; alphaev6*-*-*) config_arch="alpha-ev6" ;; alphaev56*-*-*) config_arch="alpha-ev56" ;; alphaev5*-*-*) config_arch="alpha-ev5" ;; powerpc7450-*-*) config_arch="powerpc7450" ;; powerpc-apple-darwin* | powerpc64-*-*) config_arch="powerpc970" ;; mips64el-*-*) config_arch="mips64el" ;; armv5tel-*-*) config_arch="armv5tel" ;; sparc64-*-*) config_arch="sparc64" ;; ia64-*-*) config_arch="ia64" ;; hppa2.0-*-*) config_arch="hppa2.0" ;; *) config_arch="default" ;; esac # See if this is a Core 2, if we have /proc/cpuinfo core2warn=no if test x"$config_arch" = xathlon64; then if test -f /proc/cpuinfo; then if $EGREP -q "Core\(TM\)2" /proc/cpuinfo; then config_arch=core2 elif $EGREP -q "Core\(TM\) i5" /proc/cpuinfo; then config_arch=corei5 fi; else # No /proc/cpuinfo, tell user about ecm-params.h.core2 core2warn=yes fi; fi # See if this is a Pentium 4, if we have /proc/cpuinfo pentium4warn=no if test x"$config_arch" = xdefault; then if test -f /proc/cpuinfo; then if $EGREP -q "Pentium\(R\) 4" /proc/cpuinfo; then config_arch=pentium4 fi; else # No /proc/cpuinfo, tell user about ecm-params.h.pentium4 pentium4warn=yes fi; fi LIBS="$LIBS $GWLIB" GMP_FINISH AC_CONFIG_FILES([Makefile athlon/Makefile pentium4/Makefile x86_64/Makefile powerpc64/Makefile build.vc10/Makefile build.vc10/assembler/Makefile build.vc10/ecm/Makefile build.vc10/libecm/Makefile build.vc10/tune/Makefile build.vc10/bench_mulredc/Makefile]) AC_CONFIG_LINKS([ecm-params.h:ecm-params.h.$config_arch]) MUL_FFT_PARAMS="mul_fft-params.h.$config_arch" if ! 
test -f "$MUL_FFT_PARAMS" then MUL_FFT_PARAMS="mul_fft-params.h.default" fi AC_CONFIG_LINKS([mul_fft-params.h:"$MUL_FFT_PARAMS"]) AC_SUBST([XSLDIR]) AC_SUBST([ASMPATH]) AC_SUBST([GSL_LD_FLAGS]) AC_OUTPUT AC_MSG_NOTICE([Configuration:]) AC_MSG_NOTICE([Build for host type $host]) AC_MSG_NOTICE([CC=$CC, CFLAGS=$CFLAGS]) AC_MSG_NOTICE([Linking GMP with $GMPLIB]) if test "x$enable_asm_redc" = xyes; then AC_MSG_NOTICE([Using asm redc code from directory $ASMPATH]) else AC_MSG_NOTICE([Not using asm redc code]) fi if test "x$enable_sse2" = xyes; then AC_MSG_NOTICE([Using SSE2 instructions in NTT code]) else AC_MSG_NOTICE([Not using SSE2 instructions in NTT code]) fi if test "x$with_gwnum" != "x"; then AC_MSG_NOTICE([Linking with George Woltman's GWNUM]) fi if test "x$enable_assert" = xyes; then AC_MSG_NOTICE([Assertions enabled]) else AC_MSG_NOTICE([Assertions disabled]) fi if test "x$enable_shellcmd" = xyes; then AC_MSG_NOTICE([Shell command execution enabled]) else AC_MSG_NOTICE([Shell command execution disabled]) fi if test "x$enable_openmp" = xyes; then AC_MSG_NOTICE([OpenMP enabled]) else AC_MSG_NOTICE([OpenMP disabled]) fi if test "x$enable_memory_debug" = xyes; then AC_MSG_NOTICE([Memory debugging enabled]) else AC_MSG_NOTICE([Memory debugging disabled]) fi if test x"$core2warn" = xyes; then AC_MSG_NOTICE([Your cpu was detected as x86_64; if it is a Core 2, please either use the ecm-params.h.core2 file by executing the commands:]) AC_MSG_NOTICE([rm ecm-params.h]) AC_MSG_NOTICE([ln -s ecm-params.h.core2 ecm-params.h]) AC_MSG_NOTICE([or generate a custom ecm-params.h file for your system as described in INSTALL.]) fi if test x"$pentium4warn" = xyes; then AC_MSG_NOTICE([Your cpu was detected as default; if it is a Pentium 4, please either use the ecm-params.h.pentium4 file by executing the commands:]) AC_MSG_NOTICE([rm ecm-params.h]) AC_MSG_NOTICE([ln -s ecm-params.h.pentium4 ecm-params.h]) AC_MSG_NOTICE([or generate a custom ecm-params.h file for your system as 
described in INSTALL.]) fi ecm-6.4.4/ecm.xml0000644023561000001540000005751012113417004010502 00000000000000 ECM 1 April 22, 2003 ecm integer factorization using ECM, P-1 or P+1 ecm B1 B2min-B2maxB2 DESCRIPTION ecm is an integer factoring program using the Elliptic Curve Method (ECM), the P-1 method, or the P+1 method. The following sections describe parameters relevant to these algorithms. STEP 1 AND STEP 2 BOUND PARAMETERS B1 B1 is the step 1 bound. It is a mandatory parameter. It can be given either in integer format (for example 3000000) or in floating-point format (3000000.0 or 3e6). The largest possible B1 value is 9007199254740996 for P-1, and ULONG_MAX or 9007199254740996 (whichever is smaller) for ECM and P+1. All primes 2 <= p <= B1 are processed in step 1. B2 B2 is the step 2 bound. It is optional: if omitted, a default value is computed from B1, which should be close to optimal. Like B1, it can be given either in integer or in floating-point format. The largest possible value of B2 is approximately 9e23, but depends on the number of blocks k if you specify the option. All primes B1 <= p <= B2 are processed in step 2. If B2 < B1, no step 2 is performed. B2min-B2max alternatively one may use the B2min-B2max form, which means that all primes B2min <= p <= B2max should be processed. Thus specifying B2 only corresponds to B1-B2. The values of B2min and B2max may be arbitrarily large, but their difference must not exceed approximately 9e23, subject to the number of blocks k. FACTORING METHOD Perform P-1 instead of the default method (ECM). Perform P+1 instead of the default method (ECM). GROUP AND INITIAL POINT PARAMETERS [ECM, P-1, P+1] Use x (arbitrary-precision integer or rational) as initial point. For example, is valid. If not given, x is generated from the sigma value for ECM, or at random for P-1 and P+1. [ECM] Use s (arbitrary-precision integer) as curve generator. If omitted, s is generated at random. 
[ECM] Use a (arbitrary-precision integer) as curve parameter. If omitted, it is generated from the sigma value. [ECM, P-1, P+1] Multiply the initial point by val, which can be any valid expression, possibly containing the special character N as place holder for the current input number. Example: ecm -pp1 -go "N^2-1" 1e6 < composite2000 STEP 2 PARAMETERS [ECM, P-1, P+1] Perform k blocks in step 2. For a given B2 value, increasing k decreases the memory usage of step 2, at the expense of more cpu time. Stores some tables of data in disk files to reduce the amount of memory occupied in step 2, at the expense of disk I/O. Data will be written to files file.1, file.2 etc. Does not work with fast stage 2 for P+1 and P-1. [ECM, P-1] Use x^n for Brent-Suyama's extension ( disables Brent-Suyama's extension). The default polynomial is chosen depending on the method and B2. For P-1 and P+1, disables the fast stage 2. For P-1, n must be even. [ECM, P-1] Use degree-n Dickson's polynomial for Brent-Suyama's extension. For P-1 and P+1, disables the fast stage 2. Like for , n must be even for P-1. Use at most n megabytes of memory in stage 2. Enable or disable the Number-Theoretic Transform code for polynomial arithmetic in stage 2. With NTT, dF is chosen to be a power of 2, and is limited by the number of suitable primes that fit in a machine word (which is a limitation only on 32 bit systems). The -no-ntt variant uses more memory, but is faster than NTT with large input numbers. By default, NTT is used for P-1, P+1 and for ECM on numbers of size at most 30 machine words. OUTPUT Quiet mode. Found factorizations are printed on standard output, with factors separated by white spaces, one line per input number (if no factor was found, the input number is simply copied). Verbose mode. More information is printed, more options increase verbosity.
With one , the kind of modular multiplication used, initial x0 value, step 2 parameters and progress, and expected curves and time to find factors of different sizes for ECM are printed. With , the A value for ECM and residues at the end of step 1 and step 2 are printed. More print internal data for debugging. Print a time stamp whenever a new ECM curve or P+1 or P-1 run is processed. MODULAR ARITHMETIC OPTIONS Several algorithms are available for modular multiplication. The program tries to find the best one for each input; one can force a given method with the following options. Use GMP's mpz_mod function (sub-quadratic for large inputs, but induces some overhead for small ones). Use Montgomery's multiplication (quadratic version). Usually best method for small input. Use Montgomery's multiplication (sub-quadratic version). Theoretically optimal for large input. Disable special base-2 code (which is used when the input number is a large factor of 2^n+1 or 2^n-1, see ). n Force use of special base-2 code, input number must divide 2^n+1 if n > 0, or 2^|n|-1 if n < 0. FILE I/O The following options enable one to perform step 1 and step 2 separately, either on different machines, at different times, or using different software (in particular, George Woltman's Prime95/mprime program can produce step 1 output suitable for resuming with GMP-ECM). It can also be useful to split step 2 into several runs, using the B2min-B2max option. Take input from file file instead of from standard input. Save result of step 1 in file. If file exists, an error is raised. Example: to perform only step 1 with B1=1000000 on the composite number in the file "c155" and save its result in file "foo", use ecm -save foo 1e6 1 < c155 Like , but appends to existing files. Resume residues from file, reads from standard input if file is "-". Example: to perform step 2 following the above step 1 computation, use ecm -resume foo 1e6 Periodically write the current residue in stage 1 to file. 
In case of a power failure, etc., the computation can be continued with the option. ecm -chkpnt foo -pm1 1e10 < largenumber.txt LOOP MODE The loop mode (option ) enables one to run several curves on each input number. The following options control its behavior. Perform n runs on each input number (default is one). This option is mainly useful for P+1 (for example with n=3) or for ECM, where n could be set to the expected number of curves to find a d-digit factor with a given step 1 bound. This option is incompatible with . Giving produces an infinite loop until a factor is found. In loop mode, stop when a factor is found; the default is to continue until the cofactor is prime or the specified number of runs are done. Breadth-first processing: in loop mode, run one curve for each input number, then a second curve for each one, and so on. This is the default mode with . Depth-first processing: in loop mode, run n curves for the first number, then n curves for the second one and so on. This is the default mode with standard input. In loop mode, in the second and following runs, output only expressions that have at most n characters. Default is . In loop mode, increment B1 by n after each curve. In loop mode, multiply B1 by a factor depending on n after each curve. Default is one which should be optimal on one machine, while could be used when trying to factor the same number simultaneously on 10 identical machines. SHELL COMMAND EXECUTION These options allow for executing shell commands to supplement functionality to GMP-ECM. Execute command cmd to test primality of factors and cofactors instead of GMP-ECM's own functions. The number to test is passed via stdin. An exit code of 0 is interpreted as probably prime, a non-zero exit code as composite. Executes command cmd whenever a factor is found by P-1, P+1 or ECM. The input number, factor and cofactor are passed via stdin, each on a line. This could be used e.g.
to mail new factors automatically: ecm -faccmd 'mail -s $HOSTNAME found a factor me@myaddress.com' 11e6 < cunningham.in Executes command cmd before each ECM curve, P-1 or P+1 attempt on a number is started. If the exit status of cmd is non-zero, GMP-ECM terminates immediately, otherwise it continues normally. GMP-ECM is stopped while cmd runs, offering a way for letting GMP-ECM sleep for example while the system is otherwise busy. MISCELLANEOUS Run the program in nice mode (below normal priority). Run the program in very nice mode (idle priority). Multiply the default step 2 bound B2 by the floating-point value f. Example: divides the default B2 by 2. Add n seconds to stage 1 time. This is useful to get correct expected time with -v if part of stage 1 was done in another run. Force cofactor output in decimal (even if expressions are used). , Display a short description of ecm usage, parameters and command line options. Prints configuration parameters used for the compilation and exits. INPUT SYNTAX The input numbers can have several forms: Raw decimal numbers like 123456789. Comments can be placed in the file: everything after // is ignored, up to the end of line. Line continuation. If a line ends with a backslash character \, it is considered to continue on the next line. Common arithmetic expressions can be used. Example: 3*5+2^10. Factorial: example 53!. Multi-factorial: example 15!3 means 15*12*9*6*3. Primorial: example 11# means 2*3*5*7*11. Reduced primorial: example 17#5 means 5*7*11*13*17. Functions: currently, the only available function is Phi(x,n). EXIT STATUS The exit status reflects the result of the last ECM curve or P-1/P+1 attempt the program performed. 
Individual bits signify particular events, specifically: Bit 0 0 if normal program termination, 1 if error occurred Bit 1 0 if no proper factor was found, 1 otherwise Bit 2 0 if factor is composite, 1 if factor is a probable prime Bit 3 0 if cofactor is composite, 1 if cofactor is a probable prime Thus, the following exit status values may occur: 0 Normal program termination, no factor found 1 Error 2 Composite factor found, cofactor is composite 6 Probable prime factor found, cofactor is composite 8 Input number found 10 Composite factor found, cofactor is a probable prime 14 Probable prime factor found, cofactor is a probable prime BUGS Report bugs to <ecm-discuss@lists.gforge.inria.fr>, after checking <http://www.loria.fr/~zimmerma/records/ecmnet.html> for bug fixes or new versions. AUTHORS Pierrick Gaudry <gaudry at lix dot polytechnique dot fr> contributed efficient assembly code for combined mul/redc; Jim Fougeron <jfoug at cox dot net> contributed the expression parser and several command-line options; Laurent Fousse <laurent at komite dot net> contributed the middle product code, the autoconf/automake tools, and is the maintainer of the Debian package; Alexander Kruppa <(lastname)al@loria.fr> contributed estimates for probability of success for ECM, the new P+1 and P-1 stage 2 (with P. L. Montgomery), new AMD64 asm mulredc code, and some other things; Dave Newman <david.(lastname)@jesus.ox.ac.uk> contributed the Kronecker-Schoenhage and NTT multiplication code; Jason S. Papadopoulos contributed a speedup of the NTT code; Paul Zimmermann <zimmerma at loria dot fr> is the author of the first version of the program and chief maintainer of GMP-ECM. Note: email addresses have been obscured, the required substitutions should be obvious.
ecm-6.4.4/spm.c0000644023561000001540000001566612106741273010177 00000000000000/* spm.c - "small prime modulus" functions to precompute an inverse and a primitive root for a small prime Copyright 2005, 2006, 2008, 2009, 2010, 2012 Dave Newman, Jason Papadopoulos, Paul Zimmermann, Alexander Kruppa. The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include "sp.h" /* Returns the exponent of $q$ in the factorisation of $n$ */ static int exponent (const sp_t q, sp_t n) { int i; for (i = 0; n % q == (sp_t) 0; i++, n /= q); return i; } /* Returns i so that ord(a) = q^i. This assumes that ord(a) is indeed a low power of q. */ static int ordpow (const sp_t q, sp_t a, const sp_t sp, const sp_t mul_c) { int i = 0; for (i = 0; a != (sp_t) 1; i++, a = sp_pow (a, q, sp, mul_c)); return i; } /* initialize roots of unity and twiddle factors for one NTT. If successful, returns 1. 
If unsuccessful, returns 0 (and frees allocated memory) */ static int nttdata_init (const sp_t sp, const sp_t mul_c, const sp_t prim_root, const spv_size_t log2_len, sp_nttdata_t data, spv_size_t breakover) { spv_t r, t; spv_size_t i, j, k; r = data->ntt_roots = (spv_t) sp_aligned_malloc ((log2_len + 1) * sizeof(sp_t)); if (r == NULL) return 0; i = log2_len; r[i] = prim_root; for (i--; (int)i >= 0; i--) r[i] = sp_sqr (r[i+1], sp, mul_c); k = MIN(log2_len, breakover); t = data->twiddle = (spv_t) sp_aligned_malloc (sizeof(sp_t) << k); if (t == NULL) { sp_aligned_free (r); return 0; } data->twiddle_size = 1 << k; for (i = k; i; i--) { sp_t w = r[i]; for (j = t[0] = 1; j < ((spv_size_t) 1 << (i-1)); j++) t[j] = sp_mul (t[j-1], w, sp, mul_c); t += j; } return 1; } static void nttdata_clear(sp_nttdata_t data) { sp_aligned_free(data->ntt_roots); sp_aligned_free(data->twiddle); } /* Compute some constants, including a primitive n'th root of unity. Returns NULL in case of error. k is the number of limbs of the number to factor */ spm_t spm_init (spv_size_t n, sp_t sp, mp_size_t k) { sp_t a, b, bd, sc; spv_size_t q, nc, ntt_power; spm_t spm = (spm_t) malloc (sizeof (__spm_struct)); if (spm == NULL) return NULL; ASSERT (sp % (sp_t) n == (sp_t) 1); spm->sp = sp; sp_reciprocal (spm->mul_c, sp); /* compute spm->invm = -1/p mod B where B = 2^GMP_NUMB_BITS */ a = sp_pow (2, GMP_NUMB_BITS, sp, spm->mul_c); /* a = B mod p */ a = sp_inv (a, sp, spm->mul_c); /* a = 1/B mod p */ /* a = 1/B mod p thus B*a - 1 = invm*p */ a --; b = GMP_NUMB_MASK; #if SP_NUMB_BITS == W_TYPE_SIZE - 2 a = (a << 2) + (b >> (GMP_NUMB_BITS - 2)); b = (b << 2) & GMP_NUMB_MASK; udiv_qrnnd (bd, sc, a, b, sp << 2); #else a = (a << 1) + (b >> (GMP_NUMB_BITS - 1)); b = (b << 1) & GMP_NUMB_MASK; udiv_qrnnd (bd, sc, a, b, sp << 1); #endif spm->invm = bd; /* compute spm->Bpow = B^(k+1) mod p */ spm->Bpow = sp_pow (2, GMP_NUMB_BITS * (k + 1), sp, spm->mul_c); /* find an $n$-th primitive root $a$ of unity $(mod sp)$. 
*/ /* Construct a $b$ whose order $(mod sp)$ is equal to $n$. We try different $a$ values and test if the exponent of $q$ in $ord(a)$ is at least as large as in $n$. If it isn't, we move to another $a$. If it is, we optionally exponentiate to make the exponents equal and test for the remaining $q$'s. We assume that the largest prime dividing $n$ is very small, so no optimizations in factoring n are made. */ a = 2; b = a; nc = n; /* nc is remaining cofactor of n */ q = 2; sc = sp - 1; #ifdef PARI printf ("/* spm_init */ n = %lu; sp = %lu; /* PARI */\n", n, sp); printf ("exponent(a,b) = {local(i); while(b%%a == 0,i++;b/=a); " "return(i)} /* PARI */\n"); #endif for ( ; nc != (spv_size_t) 1; q++) { if (nc % q == (spv_size_t) 0) { const int k = exponent (q, n); /* q^k || n */ sp_t d; int l; #ifdef PARI printf ("exponent(%lu, n) == %d /* PARI */\n", q, k); #endif /* Remove all factors of $q$ from $sp-1$ */ for (d = sp - 1; d % q == (spv_size_t) 0; d /= q); bd = sp_pow (b, d, sp, spm->mul_c); /* Now ord(bd) = q^l, q^l || ord(a) */ l = ordpow (q, bd, sp, spm->mul_c); #ifdef PARI printf ("exponent(%lu, znorder(Mod(%lu, sp))) == %d /* PARI */\n", q, b, l); #endif if (l < k) { /* No good, q appears in ord(a) in a lower power than in n. 
Try next $a$ */ a++; b = a; nc = n; q = 1; /* Loop increment following "continue" will make q=2 */ sc = sp - 1; continue; } else { /* Reduce the exponent of $q$ in $ord(b)$ until is it equal to that in $n$ */ for ( ; l > k; l--) { #ifdef PARI printf ("Exponentiating %lu by %lu\n", b, q); #endif b = sp_pow (b, q, sp, spm->mul_c); } #ifdef PARI printf ("New b = %lu\n", b); #endif } do {nc /= q;} while (nc % q == 0); /* Divide out all q from nc */ while (sc % q == (sp_t) 0) /* Divide out all q from sc */ sc /= q; } } b = sp_pow (b, sc, sp, spm->mul_c); #ifdef PARI printf ("znorder(Mod(%lu, sp)) == n /* PARI */\n", b, sp, n); #endif /* turn this into a primitive n'th root of unity mod p */ spm->prim_root = b; spm->inv_prim_root = sp_inv (b, sp, spm->mul_c); /* initialize auxiliary data for all supported power-of-2 NTT sizes */ ntt_power = 0; while (1) { if (n & (1 << ntt_power)) break; ntt_power++; } if (!nttdata_init (sp, spm->mul_c, sp_pow (spm->prim_root, n >> ntt_power, sp, spm->mul_c), ntt_power, spm->nttdata, NTT_GFP_TWIDDLE_DIF_BREAKOVER)) goto free_spm; if (!nttdata_init (sp, spm->mul_c, sp_pow (spm->inv_prim_root, n >> ntt_power, sp, spm->mul_c), ntt_power, spm->inttdata, NTT_GFP_TWIDDLE_DIT_BREAKOVER)) goto free_nttdata; spm->scratch = (spv_t) sp_aligned_malloc ( MAX_NTT_BLOCK_SIZE * sizeof(sp_t)); if (spm->scratch == NULL) goto free_inttdata; return spm; free_inttdata: nttdata_clear (spm->inttdata); free_nttdata: nttdata_clear (spm->nttdata); free_spm: free (spm); return NULL; } void spm_clear (spm_t spm) { nttdata_clear (spm->nttdata); nttdata_clear (spm->inttdata); sp_aligned_free (spm->scratch); free (spm); } ecm-6.4.4/tune.c0000644023561000001540000003454112106741273010344 00000000000000/* Tune program for GMP-ECM. Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Paul Zimmermann, Alexander Kruppa, Dave Newman and Jason Papadopoulos. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "ecm-gmp.h" #include "ecm-impl.h" /* 250ms, we (probably) don't need any more precision */ #define GRANULARITY 250 #define MAX_LOG2_LEN 18 /* 2 * 131072 */ #define MAX_LEN (1U << max_log2_len) #define MAX_LOG2_MPZSPV_NORMALISE_STRIDE (MIN (12, max_log2_len)) #define M_str "95209938255048826235189575712705128366296557149606415206280987204268594538412191641776798249266895999715600261737863698825644292938050707507901970225804581" #define ELAPSED elltime (__st, cputime () ) #define TUNE_FUNC_START(x) \ double x (size_t n) \ { unsigned int __i, __k = 1; long __st; /* Keep doubling the number of iterations until the timing is at least GRANULARITY */ #define TUNE_FUNC_LOOP(x) \ do { \ do { \ __st = cputime (); \ for (__i = 0; __i < __k; __i++) { x; } \ __k *= 2; \ } while (ELAPSED < GRANULARITY); \ __k /= 2; \ __st = ELAPSED; \ } while (0) #define TUNE_FUNC_END(x) \ if (tune_verbose) \ fprintf (stderr, #x "(%2ld) = %f\n", (long)n, (double) __k / (double) __st); \ return (double) __k / (double) __st; } /* Throughout, each function pointer points to a function * * double f0 (size_t n); * * that runs for at least GRANULARITY ms and then returns the number of * iterations performed per ms. 
* * X_Y_THRESHOLD denotes the threshold at which to start using Y for X. */ mpz_t M; /* yes, global variables */ gmp_randstate_t gmp_randstate; size_t mp_size; mpzspm_t mpzspm; mpzv_t x, y, z, t; spm_t spm; spv_t spv; mpzspv_t mpzspv; int tune_verbose; int max_log2_len = MAX_LOG2_LEN; int min_log2_len = 3; size_t MPZMOD_THRESHOLD; size_t REDC_THRESHOLD; size_t NTT_GFP_TWIDDLE_DIF_BREAKOVER = MAX_LOG2_LEN; size_t NTT_GFP_TWIDDLE_DIT_BREAKOVER = MAX_LOG2_LEN; size_t MUL_NTT_THRESHOLD; size_t PREREVERTDIVISION_NTT_THRESHOLD; size_t POLYINVERT_NTT_THRESHOLD; size_t POLYEVALT_NTT_THRESHOLD; size_t MPZSPV_NORMALISE_STRIDE = 256; void mpz_quick_random (mpz_t x, mpz_t M, unsigned long b) { mpz_urandomb (x, gmp_randstate, b); if (mpz_cmp (x, M) >= 0) mpz_sub (x, x, M); } double tune_mpres_mul (mp_size_t limbs, int repr) { mpmod_t modulus; mpres_t x, y, z; mpz_t N, p, q; unsigned int __k = 1, __i; long __st; mpz_init (N); mpz_init (p); mpz_init (q); /* No need to generate a probable prime, just ensure N is not divisible by 2 or 3 */ do { mpz_urandomb (N, gmp_randstate, limbs * GMP_NUMB_BITS); while (mpz_gcd_ui (NULL, N, 6) != 1) mpz_add_ui (N, N, 1); } while ((mp_size_t) mpz_size (N) != limbs); if (repr == ECM_MOD_MPZ) mpmod_init_MPZ (modulus, N); else if (repr == ECM_MOD_MODMULN) mpmod_init_MODMULN (modulus, N); else if (repr == ECM_MOD_REDC) mpmod_init_REDC (modulus, N); mpz_urandomm (p, gmp_randstate, N); mpz_urandomm (q, gmp_randstate, N); mpres_init (x, modulus); mpres_init (y, modulus); mpres_init (z, modulus); mpres_set_z (x, p, modulus); mpres_set_z (y, q, modulus); TUNE_FUNC_LOOP (mpres_mul (z, x, y, modulus)); mpres_clear (x, modulus); mpres_clear (y, modulus); mpres_clear (z, modulus); mpmod_clear (modulus); mpz_clear (N); mpz_clear (p); mpz_clear (q); return (double) __k / (double) __st; } double tune_mpres_sqr (mp_size_t limbs, int repr) { mpmod_t modulus; mpres_t x, z; mpz_t N, p; unsigned int __k = 1, __i; long __st; mpz_init (N); mpz_init (p); /* No need to 
generate a probable prime, just ensure N is not divisible by 2 or 3 */ do { mpz_urandomb (N, gmp_randstate, limbs * GMP_NUMB_BITS); while (mpz_gcd_ui (NULL, N, 6) != 1) mpz_add_ui (N, N, 1); } while ((mp_size_t) mpz_size (N) != limbs); if (repr == ECM_MOD_MPZ) mpmod_init_MPZ (modulus, N); else if (repr == ECM_MOD_MODMULN) mpmod_init_MODMULN (modulus, N); else if (repr == ECM_MOD_REDC) mpmod_init_REDC (modulus, N); mpz_urandomm (p, gmp_randstate, N); mpres_init (x, modulus); mpres_init (z, modulus); mpres_set_z (x, p, modulus); TUNE_FUNC_LOOP (mpres_sqr (z, x, modulus)); mpres_clear (x, modulus); mpres_clear (z, modulus); mpmod_clear (modulus); mpz_clear (N); mpz_clear (p); return (double) __k / (double) __st; } double tune_mpres_mul_mpz (size_t n) { return tune_mpres_mul (n, ECM_MOD_MPZ); } double tune_mpres_mul_modmuln (size_t n) { return tune_mpres_mul (n, ECM_MOD_MODMULN); } double tune_mpres_mul_redc (size_t n) { return tune_mpres_mul (n, ECM_MOD_REDC); } TUNE_FUNC_START (tune_spv_ntt_gfp_dif) NTT_GFP_TWIDDLE_DIF_BREAKOVER = n; TUNE_FUNC_LOOP (spv_ntt_gfp_dif (spv, max_log2_len, spm)); TUNE_FUNC_END (tune_spv_ntt_gfp_dif) TUNE_FUNC_START (tune_spv_ntt_gfp_dit) NTT_GFP_TWIDDLE_DIT_BREAKOVER = n; TUNE_FUNC_LOOP (spv_ntt_gfp_dit (spv, max_log2_len, spm)); TUNE_FUNC_END (tune_spv_ntt_gfp_dit_recursive) TUNE_FUNC_START (tune_ntt_mul) MUL_NTT_THRESHOLD = 0; TUNE_FUNC_LOOP (ntt_mul (z, x, y, 1 << n, NULL, 1, mpzspm)); TUNE_FUNC_END (tune_ntt_mul) TUNE_FUNC_START (tune_list_mul) TUNE_FUNC_LOOP (list_mul (z, x, 1 << n, 1, y, 1 << n, 1, t)); TUNE_FUNC_END (tune_list_mul) TUNE_FUNC_START (tune_ntt_PrerevertDivision) PREREVERTDIVISION_NTT_THRESHOLD = 0; TUNE_FUNC_LOOP (ntt_PrerevertDivision (z, x, y, mpzspv, mpzspv, 1 << n, t, mpzspm)); TUNE_FUNC_END (tune_ntt_PrerevertDivision) TUNE_FUNC_START (tune_PrerevertDivision) TUNE_FUNC_LOOP (PrerevertDivision (z, x, y, 1 << n, t, mpzspm->modulus)); TUNE_FUNC_END (tune_PrerevertDivision) TUNE_FUNC_START (tune_ntt_PolyInvert) 
POLYINVERT_NTT_THRESHOLD = 1 << n; TUNE_FUNC_LOOP (ntt_PolyInvert (z, x, 1 << n, t, mpzspm)); TUNE_FUNC_END (tune_ntt_PolyInvert) TUNE_FUNC_START (tune_PolyInvert) TUNE_FUNC_LOOP (PolyInvert (z, x, 1 << n, t, mpzspm->modulus)); TUNE_FUNC_END (tune_PolyInvert) TUNE_FUNC_START (tune_ntt_polyevalT) unsigned int i; mpzv_t *Tree = (mpzv_t *) malloc ((n + 1) * sizeof (mpzv_t)); if (Tree == NULL) { fprintf (stderr, "Cannot allocate memory in tune_ntt_polyevalT\n"); exit (1); } for (i = 0; i <= n; i++) Tree[i] = x; POLYEVALT_NTT_THRESHOLD = 1 << n; TUNE_FUNC_LOOP (ntt_polyevalT (z, 1 << n, Tree, t, mpzspv, mpzspm, NULL)); free (Tree); TUNE_FUNC_END (tune_ntt_polyevalT) TUNE_FUNC_START (tune_polyevalT) unsigned int i; mpzv_t *Tree = (mpzv_t *) malloc ((n + 1) * sizeof (mpzv_t)); if (Tree == NULL) { fprintf (stderr, "Cannot allocate memory in tune_polyevalT\n"); exit (1); } for (i = 0; i <= n; i++) Tree[i] = x; TUNE_FUNC_LOOP (polyeval_tellegen (z, 1 << n, Tree, t, 3 * (1 << n), x, mpzspm->modulus, NULL)); free (Tree); TUNE_FUNC_END (tune_polyevalT) TUNE_FUNC_START (tune_mpzspv_normalise) MPZSPV_NORMALISE_STRIDE = 1 << n; TUNE_FUNC_LOOP (mpzspv_normalise (mpzspv, 0, 1 << MAX_LOG2_MPZSPV_NORMALISE_STRIDE, mpzspm)); TUNE_FUNC_END (tune_mpzspv_normalise) TUNE_FUNC_START (tune_ecm_mul_lo_n) mp_limb_t rp[2 * MPN_MUL_LO_THRESHOLD]; mp_limb_t xp[MPN_MUL_LO_THRESHOLD]; mp_limb_t yp[MPN_MUL_LO_THRESHOLD]; if (n > 1 && n < (mp_size + 1) / 2) return 0.0; mpn_random (xp, mp_size); mpn_random (yp, mp_size); mpn_mul_lo_threshold[mp_size] = n; TUNE_FUNC_LOOP (ecm_mul_lo_n (rp, xp, yp, mp_size)); TUNE_FUNC_END (tune_ecm_mul_lo_n) /* Return the lowest n with min_n <= n < max_n such that * f1(t) >= f0(t) for all t in [n, n + k), or return max_n if no such * n exists. This function will typically return high values if there * is no 'clean' threshold between f0(n) and f1(n). 
*/ size_t crossover2 (double (*f0)(size_t), double (*f1)(size_t), size_t min_n, size_t max_n, size_t k) { size_t n = min_n; size_t t; while (n < max_n) { for (t = MIN (max_n, n + k); t > n; t--) { if ((f0)(t - 1) > (f1)(t - 1)) break; } if (t == n) return n; n = t; }; return max_n; } /* Assume f0 and f1 are monotone decreasing. Return the first n in the range * [min_n, max_n) for which f1(n) >= f0(n), or return max_n if no such n * exists. We use a bisection algorithm so the function is fast but * may give slightly varied results. */ size_t crossover (double (*f0)(size_t), double (*f1)(size_t), size_t min_n, size_t max_n) { size_t mid_n; #ifdef TUNE_SLOW return crossover2 (f0, f1, min_n, max_n, 1); #endif if (min_n == max_n) return min_n; mid_n = (max_n + min_n) / 2; return ((f0)(mid_n) > (f1)(mid_n)) ? crossover (f0, f1, mid_n + 1, max_n) : crossover (f0, f1, min_n, mid_n); } /* Return the n in the range [min_n, max_n) that maximises f(n). * We make no assumptions about the shape of f(n) and so evaluate * f at every point. */ size_t maximise (double (*f)(size_t), size_t min_n, size_t max_n) { size_t n, best_n = 0; double f_n, f_best_n = -1.0; for (n = min_n; n < max_n; n++) { f_n = f (n); if (f_n > f_best_n) { f_best_n = f_n; best_n = n; } } return best_n; } /* Debugging. Print the value of f0(n) and f1(n) and which is fastest. */ void print_timings (double (*f0)(size_t), double (*f1)(size_t), size_t min_n, size_t max_n) { size_t n; double f0_n, f1_n; for (n = min_n; n < max_n; n++) { f0_n = (f0)(n); f1_n = (f1)(n); printf ("n=%2ld: %8.2f %8.2f (f%d)\n", (long) n, f0_n, f1_n, (f0_n <= f1_n) ? 
1 : 0); } } int main (int argc, char **argv) { spv_size_t i; unsigned long b; while (argc > 1) { if (strcmp (argv[1], "-v") == 0) { tune_verbose = 1; argc --; argv ++; } else if (argc > 2 && strcmp (argv[1], "-max_log2_len") == 0) { max_log2_len = atoi (argv[2]); if (max_log2_len < min_log2_len) max_log2_len = min_log2_len; argc -= 2; argv += 2; } else { fprintf (stderr, "Usage: tune [-v] [-max_log2_len nnn]\n"); exit (1); } } gmp_randinit_default (gmp_randstate); mpz_init_set_str (M, M_str, 10); b = (unsigned long) mpz_sizeinbase (M, 2); x = init_list (MAX_LEN); y = init_list (MAX_LEN); z = init_list (MAX_LEN); t = init_list (list_mul_mem (MAX_LEN / 2) + 3 * MAX_LEN / 2); mpzspm = mpzspm_init (MAX_LEN, M); if (mpzspm == NULL) { fprintf (stderr, "Error, cannot allocate memory in mpzspm_init\n"); exit (1); } mpzspv = mpzspv_init (MAX_LEN, mpzspm); if (mpzspv == NULL) { fprintf (stderr, "Error, cannot allocate memory in mpzspv_init\n"); exit (1); } mpzspv_random (mpzspv, 0, MAX_LEN, mpzspm); for (i = 0; i < MAX_LEN; i++) mpz_quick_random (x[i], M, b); for (i = 0; i < MAX_LEN; i++) mpz_quick_random (y[i], M, b); for (i = 0; i < MAX_LEN; i++) mpz_quick_random (z[i], M, b); spm = mpzspm->spm[0]; spv = mpzspv[0]; MPZMOD_THRESHOLD = crossover2 (tune_mpres_mul_modmuln, tune_mpres_mul_mpz, 1, 512, 10); printf ("#define MPZMOD_THRESHOLD %lu\n", (unsigned long) MPZMOD_THRESHOLD); REDC_THRESHOLD = crossover2 (tune_mpres_mul_mpz, tune_mpres_mul_redc, MPZMOD_THRESHOLD, 512, 10); printf ("#define REDC_THRESHOLD %lu\n", (unsigned long) REDC_THRESHOLD); mpn_mul_lo_threshold[0] = 0; mpn_mul_lo_threshold[1] = 0; printf ("#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0"); for (mp_size = 2; mp_size < MPN_MUL_LO_THRESHOLD; mp_size++) { mpn_mul_lo_threshold[mp_size] = maximise (tune_ecm_mul_lo_n, 0, mp_size); printf (", %lu", (unsigned long) mpn_mul_lo_threshold[mp_size]); fflush (stdout); } printf ("}\n"); NTT_GFP_TWIDDLE_DIF_BREAKOVER = maximise (tune_spv_ntt_gfp_dif, min_log2_len, 
max_log2_len); printf ("#define NTT_GFP_TWIDDLE_DIF_BREAKOVER %lu\n", (unsigned long) NTT_GFP_TWIDDLE_DIF_BREAKOVER); NTT_GFP_TWIDDLE_DIT_BREAKOVER = maximise (tune_spv_ntt_gfp_dit, min_log2_len, max_log2_len); printf ("#define NTT_GFP_TWIDDLE_DIT_BREAKOVER %lu\n", (unsigned long) NTT_GFP_TWIDDLE_DIT_BREAKOVER); MUL_NTT_THRESHOLD = 1 << crossover2 (tune_list_mul, tune_ntt_mul, 1, max_log2_len, 2); printf ("#define MUL_NTT_THRESHOLD %lu\n", (unsigned long) MUL_NTT_THRESHOLD); PREREVERTDIVISION_NTT_THRESHOLD = 1 << crossover2 (tune_PrerevertDivision, tune_ntt_PrerevertDivision, 1, max_log2_len, 2); printf ("#define PREREVERTDIVISION_NTT_THRESHOLD %lu\n", (unsigned long) PREREVERTDIVISION_NTT_THRESHOLD); POLYINVERT_NTT_THRESHOLD = 1 << crossover (tune_PolyInvert, tune_ntt_PolyInvert, 5, max_log2_len); printf ("#define POLYINVERT_NTT_THRESHOLD %lu\n", (unsigned long) POLYINVERT_NTT_THRESHOLD); POLYEVALT_NTT_THRESHOLD = 1 << crossover (tune_polyevalT, tune_ntt_polyevalT, 5, max_log2_len); printf ("#define POLYEVALT_NTT_THRESHOLD %lu\n", (unsigned long) POLYEVALT_NTT_THRESHOLD); MPZSPV_NORMALISE_STRIDE = 1 << maximise (tune_mpzspv_normalise, 1, MAX_LOG2_MPZSPV_NORMALISE_STRIDE); printf ("#define MPZSPV_NORMALISE_STRIDE %lu\n", (unsigned long) MPZSPV_NORMALISE_STRIDE); mpzspv_clear (mpzspv, mpzspm); mpzspm_clear (mpzspm); clear_list (x, MAX_LEN); clear_list (y, MAX_LEN); clear_list (z, MAX_LEN); clear_list (t, list_mul_mem (MAX_LEN / 2) + 3 * MAX_LEN / 2); mpz_clear (M); gmp_randclear (gmp_randstate); return 0; } ecm-6.4.4/INSTALL-ecm0000644023561000001540000002113312111113424010774 00000000000000Instructions to install GMP-ECM: 0) you first need to install the GNU MP (GMP) library. GNU MP is available from . Remark: GNU MP is already installed in most Linux distributions. However it is often an old version, moreover without processor-specific optimizations. 
If you care about efficiency, be sure to install the latest version of GNU MP, and to compile it for your particular processor. Warning: make sure you have only one version of GMP installed on your system at a given time. Frequently, after compiling GMP from source and installing it without removing the distribution's GMP package, later attempts to build software that uses GMP find the GMP header file from the distribution's GMP package and the library from the newly compiled GMP (or vice versa). GMP-ECM tries to detect this by comparing the version number from header and library; if this test fails, you should remove the obsolete GMP installation. 1) check your configuration with: $ ./configure The configure script accepts several options (see ./configure --help). In particular you can specify the GMP installation directory with: $ ./configure --with-gmp=<gmpdir> where <gmpdir>/include contains the header file gmp.h, and <gmpdir>/lib contains the static or dynamic libraries (libgmp.a, libgmp.so, libgmp.lib). To compile the GMP-ECM library as a shared library, use the --enable-shared parameter for ./configure. Building a shared library is disabled by default. Note: the configure script will first search for a static GMP library, which makes GMP-ECM more efficient. When only a dynamic library is available, make sure to correctly set your dynamic libraries search path (LD_LIBRARY_PATH on Unix systems), otherwise the configure script may fail. Warning: it is recommended to use the same compiler and options as those used to compile GMP, otherwise the compilation may fail, or you may get poor performance. In the GMP build directory, simply type: $ egrep -w '(CC|CFLAGS)' config.log to see which compiler and options were used to build GMP.
For example on a Sparc v9 you may have to type: $ ./configure CC=cc CFLAGS="-fast -fns=no -fsimple=1 -xarch=v9" Note 2: On x86, x86-64, and 64 bit PowerPC systems, using GMP-ECM's own modular multiplication code usually gives better performance than the GMP-based functions. On these systems, configure enables it by default. If the system is not identified correctly, you can enable it by adding the command line parameter "--enable-asm-redc" to configure. To disable it, add "--disable-asm-redc". On 32-bit x86 systems that have SSE2 (e.g., Pentium 4, some Celeron, some Sempron, Via C7), use of SSE2 instructions in stage 2 of P-1, P+1, and ECM is enabled by default. You can enable it manually by adding the command line parameter "--enable-sse2" and disable it by adding "--disable-sse2" to ./configure. The SSE2 code is not used in 64-bit builds, regardless of these parameters. Note 3: If you want to use George Woltman's GWNUM library for speeding up factoring base 2 numbers, obtain the source file from (on December 2011 the latest source is source272.zip), build the gwnum library for your operating system, then use $ ./configure --with-gwnum= The directory must include the gwnum.a or gwnum.lib file as well as gwnum.h and related header files. The source file of the gwnum library is available at . 2) compile the program with: $ make This will create the 'libecm.a' library, the 'libecm.so' shared library if --enable-shared was used, the 'ecm' binary file, the 'ecmfactor' binary file (sample use of libecm.a), and 'tune', a tuning program. 3) to check that the program works correctly, type: $ make check This will run several tests for P+1, P-1, ECM. These tests take a few minutes. It should normally end with "All ECM tests are ok." 4) (optional) to tune GMP-ECM, simply type: $ make ecm-params; make See also README ("How to get the best of GMP-ECM?"). 
Note: if your machine does not have enough memory for the tune program, you can run it manually with ./tune -max_log2_len 16 for example (the default is 18). 5) (optional) you can then install the ecm binary and its man page: $ make install By default, installation will be done in /usr/local. You can change this with the --prefix option of configure at step 1: $ ./configure --prefix=<directory> The ecm binary will go in <directory>/bin, its man page in <directory>/share/man/man1, the ecm library in <directory>/lib, and the corresponding header file in <directory>/include. You can also do "make uninstall" to remove those files. 6) If you like GMP-ECM, please help us factor Cunningham numbers. First download "cunningham.in" on , then perform one ecm test with B1=110e6 on each number of this file: $ ./ecm 110e6 < cunningham.in > cunningham.out & If you find any factor (grep found cunningham.out), please submit it using the report form on . ============================================================================ Known problems: * [reported by Sam Rawlins] with MinGW under Windows XP (32-bit), the compilation fails in spv.c. A fix seems to be to add -msse2 to CFLAGS. See http://lists.gforge.inria.fr/pipermail/ecm-discuss/2010-June/004077.html * GCC 4.4 might miscompile GMP-ECM on Sparc, see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45559 for more details. The problem is due in fact to a bug in the Linux kernel. A fix is to use -mcpu=v8 with GCC 4.4 if the bug occurs. ============================================================================ For Windows users: Windows users have two options for building GMP-ECM: (a) the use of a number of Unix on Windows environments, or (b) the use of Microsoft Visual Studio C/C++ 2008. The former is described here while the latter is described in the readme.txt file within the build.vc10 subdirectory. (a) For Windows users with a Unix-like environment: Before you can compile GMP-ECM, you will need a compiler.
Several suitable compilers are freely available, for example as part of MinGW, CygWin and Microsoft's Services for Unix (SFU). We recommend MinGW as it is a smaller download than the others and generates binaries that run on any Windows system, even if they don't have MinGW installed themselves. Step-by-step instructions, courtesy of Jes Hansen: 1) Download the current MinGW from http://prdownloads.sf.net/mingw/MinGW-3.1.0-1.exe?download and MSYS from http://prdownloads.sf.net/mingw/MSYS-1.0.10.exe?download 2) Create a folder, for example C:\GNU, and install MinGW (execute the MinGW-3.1.0-1.exe file) into C:\GNU\MinGW 3) Install MSYS (execute the MSYS-1.0.10.exe file) into C:\GNU\msys Now you get an icon on the desktop where you can start the MinSys. Do this, because it creates your home folder. Then exit it again. 4) Download the latest version of GMP (in February 2013 the latest version is 5.1.1, this will be assumed for the rest of this document) in .tar.bz2 format from http://gmplib.org/ and place it in your newly created home folder. The home folder is in C:\GNU\msys\home and has the same name as your Windows login name. 5) Download GMP-ECM (if you do not have it already) from http://ecm.gforge.inria.fr/ and place it in your home folder as well. 6) Start the MinSys up again from the desktop and type tar -xvjf gmp-5.1.1.tar.bz2 cd gmp-5.1.1 ./configure make install cd ~ 7) You are back in your home directory. Now type tar -xvzf ecm-6.4.4.tar.gz cd ecm-6.4.4 ./configure --with-gmp=/usr/local make 8) Four executables should have appeared. The main application is ecm.exe, which can be run from the Windows command line.
============================================================================ In case of a problem, report it to us, with: - the output of the config.log file - the versions of GMP-ECM and GMP used (first output line), for example: GMP-ECM 6.4.4 [configured with GMP 5.1.1, --enable-asm-redc] [P+1] - the detailed input enabling us to reproduce the problem, for example: $ echo 328006342451 | ./ecm -pp1 -x0 5 120 7043 - the output you get. Then send your bug report at . This is a public list, with archives available at . ecm-6.4.4/sets_long.c0000644023561000001540000004405712106741273011371 00000000000000/* Functions for sets of long ints, to factor (Z/NZ)* into a set of sums as described in section 5 of "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic Number Theory Symposium). Copyright 2007, 2008, 2009, 2012 Alexander Kruppa, Paul Zimmermann. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "config.h" #include "ecm-impl.h" #include #ifdef HAVE_ALLOCA_H #include #endif #ifdef TESTDRIVE #include FILE *ECM_STDOUT, *ECM_STDERR; #endif /***************************************************************** Functions for processing sets A set is a cardinality of unsigned long type and an array of long ints. A set of sets is an unsigned long telling the number of sets, an array that has several sets stored back-to-back. *****************************************************************/ /* Copy a set from "*S" to "*T". Assumes that the sets do not overlap, or that T < S. */ static void set_copy (set_long_t *T, set_long_t *S) { unsigned long i; const unsigned long c = S->card; /* We might overwrite S->card */ T->card = c; for (i = 0UL; i < c; i++) T->elem[i] = S->elem[i]; } /* Exchange two adjacent sets in memory. Since all "elem" arrays are stored in the same chunk of allocated memory, and not in different chunks, we cannot simply swap the "elem" pointers. If the set T has size c and the next has size d, after the swap the set T will have size d and the next will have size c. */ static void set_swap (set_long_t *T) { set_long_t *next, *tmp; next = sets_nextset (T); tmp = alloca (set_sizeof (T->card)); ASSERT(tmp != NULL); set_copy (tmp, T); set_copy (T, next); /* warning: sets_nextset(T) might differ from next, if T and next had different sizes */ set_copy (sets_nextset(T), tmp); } /* Functions for sorting an array of longs */ static inline void swap_long (long *a, long *b) { long t; t = *a; *a = *b; *b = t; } static inline void swapsort_long (long *a, long *b) { if (*a > *b) swap_long (a, b); } void quicksort_long (long *a, unsigned long l) { unsigned long i, j; long pivot; if (l < 2) return; j = l - 1; swapsort_long (a, a+j); if (l == 2) return; i = j / 2; swapsort_long (a, a+i); swapsort_long (a+i, a+j); if (l == 3) return; pivot = a[i]; /* Median of three */ /* Stuff <= pivot goes in first list */ /* Invariant: a[0 ... i-1] <= pivot, a[j+1 ... 
l-1] > pivot */ for (i = 1; i < j;) if (a[i] > pivot) { for (; a[j] > pivot; j--); if (i < j) swap_long (a+(i++), a+j); } else i++; #ifdef WANT_ASSERT for (j = 0; j < i; j++) ASSERT (a[j] <= pivot); for (j = i; j < l; j++) ASSERT(a[j] > pivot); #endif quicksort_long (a, i); quicksort_long (a + i, l - i); #ifdef WANT_ASSERT for (j = 0; i < l - 1; i++) ASSERT (a[j] <= a[j + 1]); #endif } /* Returns max(S), where S == (Z/\beta Z)* as chosen by sets_get_factored_sorted() */ /* Assumes that S == 0 at recursion entry */ static void sets_max_recurse (mpz_t S, const unsigned long beta) { unsigned long P = beta, p, pk; unsigned int k; if (beta == 1UL) return; p = find_factor (P); k = 1; pk = p; P /= p; while (P % p == 0) { k++; pk *= p; P /= p; /* P*pk=beta is invariant */ } sets_max_recurse (S, P); mpz_mul_ui (S, S, pk); if (p == 2UL && k == 1) mpz_add_ui (S, S, P); else if (p == 2UL) mpz_add_ui (S, S, P * (pk / 2UL - 1UL)); else if (p % 4UL == 1UL) mpz_add_ui (S, S, P * ((pk + p) / 2UL - 2UL)); else if (p % 4UL == 3UL) mpz_add_ui (S, S, P * ((pk - 1UL) / 2UL)); else abort(); } void sets_max (mpz_t S, const unsigned long beta) { mpz_set_ui (S, 0UL); sets_max_recurse (S, beta); } /* Compute the set of sums over the "nr_sets" different sets in "*sets". The value of "add" is added to each element of the set of sums. "*sum" will have {\prod_{S \in "*sets"} #S} entries and must have enough memory allocated. This number of elements in the set of sums is the return value. In case of nr_sets == 0, "add" is written to *sets and 1 is returned. The sets in "*sets" are assumed to be non-empty. If "*sum" is NULL, nothing is written, but the return value is computed correctly. 
*/ static unsigned long sets_sumset_recurse (long *sum, const set_long_t *sets, const unsigned long nr_sets, const long add) { unsigned long i, j = 0UL; if (nr_sets == 0UL) { if (sum != NULL) sum[0] = add; return 1UL; } ASSERT (sets->card > 0UL); for (i = 0UL; i < sets->card; i++) { /* Test for overflow */ ASSERT_ALWAYS (add <= 0 || add + sets->elem[i] > sets->elem[i]); ASSERT_ALWAYS (add >= 0 || add + sets->elem[i] < sets->elem[i]); j += sets_sumset_recurse (sum + j, sets_nextset(sets), nr_sets - 1UL, add + sets->elem[i]); } return j; } void sets_sumset (set_long_t *sum, const sets_long_t *sets) { sum->card = sets_sumset_recurse (sum->elem, sets->sets, sets->nr, 0L); } /* Returns the minimal (if minmax == -1) or maximal (minmax == 1) value in the set of sums over the sets in "*sets". */ void sets_sumset_minmax (mpz_t sum, const sets_long_t *sets, const int minmax) { unsigned long i, nr; const set_long_t *set = sets->sets; long extremum; ASSERT (minmax == 1 || minmax == -1); mpz_set_ui (sum, 0UL); for (nr = 0; nr < sets->nr; nr++) { ASSERT (set->card > 0UL); extremum = set->elem[0]; for (i = 1UL; i < set->card; i++) if ((minmax == -1 && set->elem[i] < extremum) || (minmax == 1 && set->elem[i] > extremum)) extremum = set->elem[i]; if (extremum >= 0) mpz_add_ui (sum, sum, extremum); else mpz_sub_ui (sum, sum, -extremum); set = sets_nextset (set); } return; } /* Store in (**L) arithmetic progressions of prime length whose sumset is k/2*R_n, an arithmetic progression centered at 0 of common difference k and cardinality n. If n is even, k must be as well to ensure integer results. I.e. n = 1: k/2*R_n = {0}, n = 2: k/2*R_n = k/2 * {1, -1}, n = 3: k/2*R_n = k * {-1, 0, 1}, n = 4: k/2*R_n = k/2 * {-3, -1, 1, 3}, n = 5: k/2*R_n = k * {-2, -1, 0, 1, 2} etc. 
_ADDS_ the size in bytes of the set to "*sets_size" */ static unsigned long sets_factored_Rn2 (set_long_t **L, size_t *sets_size, const long n, const long k) { unsigned long nr = 0UL; long i, m, q, r; size_t size = 0; /* n must be odd, or n and k both even */ ASSERT_ALWAYS(n % 2L == 1L || k % 2L == 0L); ASSERT(L != NULL); m = k; /* The multiplier accumulated so far, init to k */ r = n; /* The remaining cofactor of n */ for (q = 2L; r > 1L; q = (q + 1L) | 1L) /* Find prime factors of n */ { ASSERT (q <= r); while (r % q == 0L) { if (*L != NULL) { /* Add m*R_q/2 to list */ (*L)->card = q; for (i = 0L; i < q; i++) { const long t = m * (2L * i - q + 1L); ASSERT(t % 2L == 0L); (*L)->elem[i] = t / 2L; } *L = sets_nextset (*L); nr++; } size += set_sizeof ((unsigned long) q); /* Multiply this t to multiplier and treat remaining factors of the set */ m *= q; r /= q; } } if (sets_size != NULL) *sets_size += size; return nr; } /* Return a set L of sets M_i so that M_1 + ... + M_k is congruent to (Z/nZ)*, which is the set of residue classes coprime to n. The M_i all have prime cardinality. The size of the set of sets "*L" in bytes is computed and stored in "*sets_size" unless "*sets_size" is NULL. Return the number of sets in L. If L is the NULL pointer, nothing will be stored in L. The correct return value (number of set in L) and "*sets_size" value will still be computed, for example so that the correct amount of space can be allocated and factor_coprimeset() be called again. */ static unsigned long sets_factor_coprime (sets_long_t *sets, size_t *sets_size, const unsigned long n) { unsigned long r, k, nr = 0UL; long p, np; size_t size = sizeof (unsigned long); set_long_t *set = NULL; ASSERT (n > 0UL); if (sets != NULL) set = sets->sets; r = n; while (r > 1UL) { for (p = 2L; r % p > 0L; p++); /* Find smallest prime p that divides r */ for (k = 0UL; r % p == 0UL; k++, r /= p); /* Find p^k || r */ np = n/p; if (p == 2L && k == 1UL) /* Case 2^1. 
Deal with it before the */ { /* while loop below decreases k. */ if (set != NULL) { set->card = 1UL; set->elem[0] = np; set = sets_nextset (set); } size += set_sizeof (1UL); nr++; } /* If k > 1, do the \sum_{i=1}^{k-1} p^i (Z/pZ) part here. (Z/pZ) is represented by an arithmetic progression of common difference 1 and length p. */ while (k-- > 1UL) { nr += sets_factored_Rn2 (&set, &size, p, np); np /= p; } if (p % 4L == 3L) { /* We can use \hat{S}_p. Factor as {-(p+1)/4, (p+1)/4} + C_{(p-1)/2} */ /* Add the {-(p+1)/4, (p+1)/4} set to L */ nr += sets_factored_Rn2 (&set, &size, 2L, (p + 1L) / 2L * np); /* Add the np / 2 * R_{(p-1)/2} set to L */ nr += sets_factored_Rn2 (&set, &size, (p - 1L) / 2L, np); } else if (p % 4L == 1L) { /* Factor into arithmetic progressions of prime length. R_{p} = {-p+1, -p+3, ..., p-3, p+1}, i.e. R_2 = {-1, 1}, R_3 = {-2, 0, 2}, R_4 = {-3, -1, 1, 3} We have R_{sq} = R_q + q*R_s */ nr += sets_factored_Rn2 (&set, &size, p - 1L, 2L * np); } } if (sets_size != NULL) *sets_size = size; if (sets != NULL) sets->nr = nr; return nr; } /* Sort the sets in F into order of ascending cardinality. Uses a simple Bubble sort. */ static void sets_sort (sets_long_t *sets) { unsigned long i, nr_unsorted, highest_swap; set_long_t *set; /* The last sets->nr - nr_unsorted sets in "*sets" are known to be sorted and each one larger than any of the first nr_unsorted sets in "*sets". */ nr_unsorted = sets->nr; while (nr_unsorted > 1UL) { outputf (OUTPUT_TRACE, "nr_unsorted = %lu. 
", nr_unsorted); sets_print (OUTPUT_TRACE, sets); set = sets->sets; highest_swap = 1UL; for (i = 1UL; i < nr_unsorted; i++) { if (set->card > sets_nextset(set)->card) { outputf (OUTPUT_TRACE, "sets_sort: swapping %lu and %lu\n", i - 1, i); set_swap (set); highest_swap = i; } set = sets_nextset (set); } nr_unsorted = highest_swap; } #ifdef WANT_ASSERT set = sets->sets; for (i = 0UL; i + 1UL < sets->nr; i++) { ASSERT(set->card <= sets_nextset (set)->card); set = sets_nextset (set); } #endif } /* Print all the sets in "*sets", formatted as a sum of sets */ void sets_print (const int verbosity, sets_long_t *sets) { unsigned long i, j; set_long_t *set = sets->sets; for (i = 0UL; i < sets->nr; i++) { if (i == 0UL) outputf (verbosity, "{"); else outputf (verbosity, " + {"); ASSERT(set->card > 0UL); outputf (verbosity, "%ld", set->elem[0]); for (j = 1UL; j < set->card; j++) outputf (verbosity, ", %ld", set->elem[j]); outputf (verbosity, "}"); set = sets_nextset (set); } outputf (verbosity, "\n"); } /* Extract sets whose set of sums has cardinality "d". We expect that "d" divides the cardinality of the set of sums of "sets" and that the cardinalities of the sets in "sets" are all prime. The amount of memory in bytes needed to store the extracted sets in "*extracted" is stored at "*extr_size". The number of sets extracted is returned. (If d = p_1 * ... * p_k, the return value is k and "*extr_size" is set_sizeof(p_1) + ... + set_sizeof(p_k).) If "*extracted" is NULL, nothing is written and no sets are removed from "*sets", but "*extr_size" is computed as if they were. */ void sets_extract (sets_long_t *extracted, size_t *extr_size, sets_long_t *sets, const unsigned long d) { unsigned long i, c, remaining_d = d; set_long_t *readfrom, *readnext, *moveto, *extractto = NULL; size_t extracted_size = sizeof (unsigned long); ASSERT_ALWAYS (d > 0UL); if (d == 1UL) { /* d == 1 means we need to extract a set of cardinality 1, which we most likely don't have in "*sets". 
(FIXME: check for set of cardinality 1?) We return the set containing only zero, which can be added to any set of sets without changing the set of sums */ if (extracted != NULL) { extracted->nr = 1; extractto = extracted->sets; extractto->card = 1UL; extractto->elem[0] = 0L; } if (extr_size != NULL) *extr_size = sizeof (unsigned long) + set_sizeof (1UL); return; } if (extracted != NULL) { extracted->nr = 0UL; extractto = extracted->sets; } /* All sets from *sets are read via *readfrom, and (assuming we actually extract them) are either copied to *extractto to *moveto */ readfrom = moveto = sets->sets; for (i = 0UL; i < sets->nr; i++) { c = readfrom->card; /* readfrom->card may get garbled */ readnext = sets_nextset (readfrom); if (remaining_d % c == 0UL) { if (extracted != NULL) { /* Copy this set to extractto */ set_copy (extractto, readfrom); extractto = sets_nextset (extractto); extracted->nr++; } remaining_d /= c; extracted_size += set_sizeof (c); } else { if (extracted != NULL) { /* Move this set within "*sets", filling the gaps left by extracted sets */ set_copy (moveto, readfrom); moveto = sets_nextset (moveto); } } readfrom = readnext; } ASSERT_ALWAYS (remaining_d == 1UL); if (extr_size != NULL) *extr_size = extracted_size; if (extracted != NULL) sets->nr -= extracted->nr; } sets_long_t * sets_get_factored_sorted (const unsigned long beta) { sets_long_t *sets; size_t size; sets_factor_coprime (NULL, &size, beta); sets = malloc (size); if (sets == NULL) return NULL; sets_factor_coprime (sets, NULL, beta); if (test_verbose (OUTPUT_TRACE)) { outputf (OUTPUT_TRACE, "sets_get_factored_sorted: Factored sets before sorting are "); sets_print (OUTPUT_TRACE, sets); } sets_sort (sets); if (test_verbose (OUTPUT_TRACE)) { outputf (OUTPUT_TRACE, "Factored sets after sorting are "); sets_print (OUTPUT_TRACE, sets); } return sets; } #ifdef TESTDRIVE static void selftest (const unsigned long beta) { sets_long_t *sets; set_long_t *sumset; unsigned long i, j, phibeta; mpz_t 
max; ASSERT_ALWAYS (beta > 0); sets = sets_get_factored_sorted (beta); /* Test that the sumset % beta is equal to (Z/betaZ)* % beta */ phibeta = eulerphi (beta); sumset = malloc (set_sizeof (phibeta)); if (sumset == NULL) { fprintf (stderr, "Cannot allocate memory in selftest\n"); exit (1); } sets_sumset (sumset, sets); ASSERT_ALWAYS (sumset->card = phibeta); /* Also test that max (sumset) == sets_max (beta) */ mpz_init (max); sets_max (max, beta); if (phibeta > 0) { long maxelem; maxelem = sumset->elem[0]; for (i = 1; i < phibeta; i++) if (maxelem < sumset->elem[i]) maxelem = sumset->elem[i]; ASSERT_ALWAYS (mpz_cmp_si (max, maxelem) == 0); } else { ASSERT_ALWAYS (mpz_cmp_ui (max, 0UL) == 0); } mpz_clear (max); /* printf ("sumset, before reduction: "); for (i = 0; i < phibeta; i++) printf ("%ld%s", sumset->elem[i], i < phibeta-1 ? ", " : "\n"); */ for (i = 0; i < phibeta; i++) { sumset->elem[i] = (sumset->elem[i] < 0L) ? beta - (long) ((unsigned long) (-sumset->elem[i]) % beta) : (unsigned long) sumset->elem[i] % beta; ASSERT_ALWAYS (sumset->elem[i] >= 0L); ASSERT_ALWAYS (sumset->elem[i] < (long) beta); } /* printf ("sumset, after reduction: "); for (i = 0; i < phibeta; i++) printf ("%ld%s", sumset->elem[i], i < phibeta-1 ? ", " : "\n"); */ quicksort_long (sumset->elem, sumset->card); /* printf ("sumset, after sorting: "); for (i = 0; i < phibeta; i++) printf ("%ld%s", sumset->elem[i], i < phibeta-1 ? 
", " : "\n"); */ j = 0; for (i = 1; i < beta; i++) { if (gcd (i, beta) == 1) { if (sumset->elem[j] != (long) i) { printf ("sumset->elem[%ld] = %ld != %ld\n", j, sumset->elem[j], i); abort(); } j++; } } free (sumset); free (sets); } int main (int argc, char **argv) { unsigned long beta; const unsigned long selftest_max = 1000; int loop = 1; ECM_STDOUT = stdout; ECM_STDERR = stderr; if (argc > 1) { beta = atol (argv[1]); loop = 0; } if (!loop) set_verbose (OUTPUT_TRACE); if (!loop) selftest (beta); else { printf ("Testing beta = 1, ..., %lu\n", selftest_max); for (beta = 1; beta < selftest_max; beta++) selftest (beta); } return 0; } #endif ecm-6.4.4/champions.h0000644023561000001540000000131112111113424011327 00000000000000/* champions.h: defines the keepers of Top-10 lists for P-1, P+1, and ECM factors, and the size that is currently needed to enter the Top-10 */ /* people keeping track of champions and corresponding url's: ECM, P-1, P+1 */ static char *champion_keeper[3] = { "Richard Brent ", "Paul Zimmermann ", "Paul Zimmermann "}; static char *champion_url[3] = {"http://wwwmaths.anu.edu.au/~brent/ftp/champs.txt", "http://www.loria.fr/~zimmerma/records/Pminus1.html", "http://www.loria.fr/~zimmerma/records/Pplus1.html"}; /* minimal number of digits to enter the champions table for ECM, P-1, P+1 */ static unsigned int champion_digits[3] = { 70, 54, 48 }; ecm-6.4.4/mpzspm.c0000644023561000001540000002772112106741273010721 00000000000000/* mpzspm.c - "mpz small prime moduli" - pick a set of small primes large enough to represent a mpzv Copyright 2005, 2006, 2007, 2008, 2009, 2010 Dave Newman, Jason Papadopoulos, Paul Zimmermann, Alexander Kruppa. The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include /* for printf */ #include #include "sp.h" #include "ecm-impl.h" /* Tables for the maximum possible modulus (in bit size) for different transform lengths l. The modulus is limited by the condition that primes must be p_i == 1 (mod l), and \Prod_i p_i >= 4l (modulus * S)^2, where S=\Sum_i p_i. Hence for each l=2^k, we take the product P and sum S of primes p_i, SP_MIN <= p_i <= SP_MAX and p_i == 1 (mod l), and store floor (log_2 (sqrt (P / (4l S^2)))) in the table. We only consider power-of-two transform lengths <= 2^31 here. Table entries generated with l=2^k;p=1;P=1;S=0;while(p<=SP_MAX, if(p>=SP_MIN && isprime(p), S+=p; P*=p); \ p+=l);print(floor (log2 (sqrt (P / (4*l * S^2))))) in Pari/GP for k=9 ... 24. k<9 simply were doubled and rounded down in each step. We curently assume that SP_MIN == 2^(SP_NUMB_BITS-1) and SP_MAX == 2^(SP_NUMB_BITS). 
*/ #if (SP_NUMB_BITS == 30) static unsigned long sp_max_modulus_bits[32] = {0, 380000000, 190000000, 95000000, 48000000, 24000000, 12000000, 6000000, 3000000, 1512786, 756186, 378624, 188661, 93737, 46252, 23342, 11537, 5791, 3070, 1563, 782, 397, 132, 43, 0, 0, 0, 0, 0, 0, 0, 0}; #elif (SP_NUMB_BITS == 31) static unsigned long sp_max_modulus_bits[32] = {0, 750000000, 380000000, 190000000, 95000000, 48000000, 24000000, 12000000, 6000000, 3028766, 1512573, 756200, 379353, 190044, 94870, 47414, 23322, 11620, 5891, 2910, 1340, 578, 228, 106, 60, 30, 0, 0, 0, 0, 0, 0}; #elif (SP_NUMB_BITS == 32) static unsigned long sp_max_modulus_bits[32] = {0, 1520000000, 760000000, 380000000, 190000000, 95000000, 48000000, 24000000, 12000000, 6041939, 3022090, 1509176, 752516, 376924, 190107, 95348, 47601, 24253, 11971, 6162, 3087, 1557, 833, 345, 172, 78, 46, 15, 0, 0, 0, 0}; #elif (SP_NUMB_BITS >= 60) /* There are so many primes, we can do pretty much any modulus with any transform length. I didn't bother computing the actual values. 
*/ static unsigned long sp_max_modulus_bits[32] = {0, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX, ULONG_MAX}; #else #error Table of maximal modulus for transform lengths not defined for this SP_MIN ; #endif /* Returns the largest possible transform length we can do for modulus without running out of primes */ spv_size_t mpzspm_max_len (mpz_t modulus) { int i; size_t b; b = mpz_sizeinbase (modulus, 2); /* b = floor (log_2 (modulus)) + 1 */ /* Transform length 2^k is ok if log2(modulus) <= sp_max_modulus_bits[k] <==> ceil(log2(modulus)) <= sp_max_modulus_bits[k] <==> floor(log_2(modulus)) + 1 <= sp_max_modulus_bits[k] if modulus isn't a power of 2 */ for (i = 0; i < 30; i++) { if (b > sp_max_modulus_bits[i + 1]) break; } return (spv_size_t)1 << i; } /* initialize mpzspm->T such that with m[j] := mpzspm->spm[j]->sp T[0][0] = m[0], ..., T[0][n-1] = m[n-1] ... T[d-1][0] = m[0]*...*m[ceil(n/2)-1], T[d-1][1] = m[ceil(n/2)] * ... * m[n-1] T[d][0] = m[0] * ... * m[n-1] where d = ceil(log(n)/log(2)). 
If n = 5, T[0]: 1, 1, 1, 1, 1 T[1]: 2, 2, 1 T[2]: 4, 1 */ static void mpzspm_product_tree_init (mpzspm_t mpzspm) { unsigned int d, i, j, oldn; unsigned int n = mpzspm->sp_num; mpzv_t *T; for (i = n, d = 0; i > 1; i = (i + 1) / 2, d ++); if (d <= I0_THRESHOLD) { mpzspm->T = NULL; return; } T = (mpzv_t*) malloc ((d + 1) * sizeof (mpzv_t)); T[0] = (mpzv_t) malloc (n * sizeof (mpz_t)); for (j = 0; j < n; j++) { mpz_init (T[0][j]); mpz_set_sp (T[0][j], mpzspm->spm[j]->sp); } for (i = 1; i <= d; i++) { oldn = n; n = (n + 1) / 2; T[i] = (mpzv_t) malloc (n * sizeof (mpz_t)); for (j = 0; j < n; j++) { mpz_init (T[i][j]); if (2 * j + 1 < oldn) mpz_mul (T[i][j], T[i-1][2*j], T[i-1][2*j+1]); else /* oldn is odd */ mpz_set (T[i][j], T[i-1][2*j]); } } mpzspm->T = T; mpzspm->d = d; } /* This function initializes a mpzspm_t structure which contains the number of small primes, the small primes with associated primitive roots and precomputed data for the CRT to allow convolution products of length up to "max_len" with modulus "modulus". Returns NULL in case of an error. */ mpzspm_t mpzspm_init (spv_size_t max_len, mpz_t modulus) { unsigned int ub, i, j; mpz_t P, S, T, mp, mt; /* mp is p as mpz_t, mt is a temp mpz_t */ sp_t p, a; mpzspm_t mpzspm; long st; st = cputime (); mpzspm = (mpzspm_t) malloc (sizeof (__mpzspm_struct)); if (mpzspm == NULL) return NULL; /* Upper bound for the number of primes we need. * Let minp, maxp denote the min, max permissible prime, * S the sum of p_1, p_2, ..., p_ub, * P the product of p_1, p_2, ..., p_ub/ * * Choose ub s.t. * * ub * log(minp) >= log(4 * max_len * modulus^2 * maxp^4) * * => P >= minp ^ ub >= 4 * max_len * modulus^2 * maxp^4 * >= 4 * max_len * modulus^2 * (ub * maxp)^2 * >= 4 * max_len * modulus^2 * S^2 * * So we need at most ub primes to satisfy this condition. 
*/ ub = (2 + 2 * mpz_sizeinbase (modulus, 2) + ceil_log_2 (max_len) + \ 4 * SP_NUMB_BITS) / (SP_NUMB_BITS - 1); mpzspm->spm = (spm_t *) malloc (ub * sizeof (spm_t)); if (mpzspm->spm == NULL) goto error_clear_mpzspm; mpzspm->sp_num = 0; /* product of primes selected so far */ mpz_init_set_ui (P, 1UL); /* sum of primes selected so far */ mpz_init (S); /* T is len*modulus^2, the upper bound on output coefficients of a convolution */ mpz_init (T); mpz_mul (T, modulus, modulus); mpz_mul_ui (T, T, max_len); mpz_init (mp); mpz_init (mt); /* find primes congruent to 1 mod max_len so we can do * a ntt of size max_len */ /* Find the largest p <= SP_MAX that is p == 1 (mod max_len) */ p = (SP_MAX / (sp_t) max_len) * (sp_t) max_len; if (p == SP_MAX) /* If max_len | SP_MAX, the +1 might cause overflow */ p = p - (sp_t) max_len + (sp_t) 1; else p++; do { while (p >= SP_MIN && p > (sp_t) max_len && !sp_prime(p)) p -= (sp_t) max_len; /* all primes must be in range */ if (p < SP_MIN || p <= (sp_t) max_len) { outputf (OUTPUT_ERROR, "not enough primes == 1 (mod %lu) in interval\n", (unsigned long) max_len); goto error_clear_mpzspm_spm; } mpzspm->spm[mpzspm->sp_num] = spm_init (max_len, p, mpz_size (modulus)); if (mpzspm->spm[mpzspm->sp_num] == NULL) { outputf (OUTPUT_ERROR, "Out of memory in mpzspm_init()\n"); goto error_clear_mpzspm_spm; } mpzspm->sp_num++; mpz_set_sp (mp, p); mpz_mul (P, P, mp); mpz_add (S, S, mp); /* we want P > 4 * max_len * (modulus * S)^2. 
The S^2 term is due to theorem 3.1 in Bernstein and Sorenson's paper */ mpz_mul (T, S, modulus); mpz_mul (T, T, T); mpz_mul_ui (T, T, max_len); mpz_mul_2exp (T, T, 2UL); p -= (sp_t) max_len; } while (mpz_cmp (P, T) <= 0); outputf (OUTPUT_DEVVERBOSE, "mpzspm_init: finding %u primes took %lums\n", mpzspm->sp_num, cputime() - st); mpz_init_set (mpzspm->modulus, modulus); mpzspm->max_ntt_size = max_len; mpzspm->crt1 = (mpzv_t) malloc (mpzspm->sp_num * sizeof (mpz_t)); mpzspm->crt2 = (mpzv_t) malloc ((mpzspm->sp_num + 2) * sizeof (mpz_t)); mpzspm->crt3 = (spv_t) malloc (mpzspm->sp_num * sizeof (sp_t)); mpzspm->crt4 = (spv_t *) malloc (mpzspm->sp_num * sizeof (spv_t)); mpzspm->crt5 = (spv_t) malloc (mpzspm->sp_num * sizeof (sp_t)); if (mpzspm->crt1 == NULL || mpzspm->crt2 == NULL || mpzspm->crt3 == NULL || mpzspm->crt4 == NULL || mpzspm->crt5 == NULL) { outputf (OUTPUT_ERROR, "Out of memory in mpzspm_init()\n"); goto error_clear_crt; } for (i = 0; i < mpzspm->sp_num; i++) mpzspm->crt4[i] = NULL; for (i = 0; i < mpzspm->sp_num; i++) { mpzspm->crt4[i] = (spv_t) malloc (mpzspm->sp_num * sizeof (sp_t)); if (mpzspm->crt4[i] == NULL) goto error_clear_crt4; } for (i = 0; i < mpzspm->sp_num; i++) { p = mpzspm->spm[i]->sp; mpz_set_sp (mp, p); /* crt3[i] = (P / p)^{-1} mod p */ mpz_fdiv_q (T, P, mp); mpz_fdiv_r (mt, T, mp); a = mpz_get_sp (mt); mpzspm->crt3[i] = sp_inv (a, p, mpzspm->spm[i]->mul_c); /* crt1[i] = (P / p) mod modulus */ mpz_init (mpzspm->crt1[i]); mpz_mod (mpzspm->crt1[i], T, modulus); /* crt4[i][j] = ((P / p[i]) mod modulus) mod p[j] */ for (j = 0; j < mpzspm->sp_num; j++) { mpz_set_sp (mp, mpzspm->spm[j]->sp); mpz_fdiv_r (mt, mpzspm->crt1[i], mp); mpzspm->crt4[j][i] = mpz_get_sp (mt); } /* crt5[i] = (-P mod modulus) mod p */ mpz_mod (T, P, modulus); mpz_sub (T, modulus, T); mpz_set_sp (mp, p); mpz_fdiv_r (mt, T, mp); mpzspm->crt5[i] = mpz_get_sp (mt); } mpz_set_ui (T, 0); for (i = 0; i < mpzspm->sp_num + 2; i++) { mpz_mod (T, T, modulus); mpz_init_set 
(mpzspm->crt2[i], T); mpz_sub (T, T, P); } mpz_clear (mp); mpz_clear (mt); mpz_clear (P); mpz_clear (S); mpz_clear (T); mpzspm_product_tree_init (mpzspm); outputf (OUTPUT_DEVVERBOSE, "mpzspm_init took %lums\n", cputime() - st); return mpzspm; /* Error cases: free memory we allocated so far */ error_clear_crt4: for (i = 0; i < mpzspm->sp_num; i++) free (mpzspm->crt4[i]); error_clear_crt: free (mpzspm->crt1); free (mpzspm->crt2); free (mpzspm->crt3); free (mpzspm->crt4); free (mpzspm->crt5); error_clear_mpzspm_spm: for (i = 0; i < mpzspm->sp_num; i++) free(mpzspm->spm[i]); free (mpzspm->spm); error_clear_mpzspm: free (mpzspm); return NULL; } /* clear the product tree T */ static void mpzspm_product_tree_clear (mpzspm_t mpzspm) { unsigned int i, j; unsigned int n = mpzspm->sp_num; unsigned int d = mpzspm->d; mpzv_t *T = mpzspm->T; if (T == NULL) /* use the slow method */ return; for (i = 0; i <= d; i++) { for (j = 0; j < n; j++) mpz_clear (T[i][j]); free (T[i]); n = (n + 1) / 2; } free (T); } void mpzspm_clear (mpzspm_t mpzspm) { unsigned int i; mpzspm_product_tree_clear (mpzspm); for (i = 0; i < mpzspm->sp_num; i++) { mpz_clear (mpzspm->crt1[i]); free (mpzspm->crt4[i]); spm_clear (mpzspm->spm[i]); } for (i = 0; i < mpzspm->sp_num + 2; i++) mpz_clear (mpzspm->crt2[i]); free (mpzspm->crt1); free (mpzspm->crt2); free (mpzspm->crt3); free (mpzspm->crt4); free (mpzspm->crt5); mpz_clear (mpzspm->modulus); free (mpzspm->spm); free (mpzspm); } ecm-6.4.4/stage2.c0000644023561000001540000007015412106741273010556 00000000000000/* Common stage 2 for ECM, P-1 and P+1 (improved standard continuation with subquadratic polynomial arithmetic). Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Paul Zimmermann, Alexander Kruppa, Dave Newman. This file is part of the ECM Library. 
The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #include #include #include /* for floor */ #include /* for strlen */ #ifdef HAVE_UNISTD_H #include /* for unlink */ #endif #include "ecm-impl.h" #include "sp.h" extern unsigned int Fermat; /* r <- Dickson(n,a)(x) */ static void dickson (mpz_t r, mpz_t x, unsigned int n, int a) { unsigned int i, b = 0; mpz_t t, u; if (n == 0) { mpz_set_ui (r, 1); return; } while (n > 2 && (n & 1) == 0) { b++; n >>= 1; } mpz_set (r, x); MPZ_INIT (t); MPZ_INIT (u); if (n > 1) { mpz_set (r, x); mpz_mul (r, r, r); mpz_sub_si (r, r, a); mpz_sub_si (r, r, a); /* r = dickson(x, 2, a) */ mpz_set (t, x); /* t = dickson(x, 1, a) */ for (i = 2; i < n; i++) { mpz_mul_si (u, t, a); mpz_set (t, r); /* t = dickson(x, i, a) */ mpz_mul (r, r, x); mpz_sub (r, r, u); /* r = dickson(x, i+1, a) */ } } for ( ; b > 0; b--) { mpz_mul (t, r, r); /* t = dickson(x, n, a) ^ 2 */ mpz_ui_pow_ui (u, abs (a), n); if (n & 1 && a < 0) mpz_neg (u, u); mpz_mul_2exp (u, u, 1); /* u = 2 * a^n */ mpz_sub (r, t, u); /* r = dickson(x, 2*n, a) */ n <<= 1; } mpz_clear (t); mpz_clear (u); } /* Init table to allow computation of Dickson_{E, a} (s + n*D), for successive n, where Dickson_{E, a} is the Dickson polynomial of degree E with parameter a. 
For a == 0, Dickson_{E, a} (x) = x^E . See Knuth, TAOCP vol.2, 4.6.4 and exercise 7 in 4.6.4, and "An FFT Extension of the Elliptic Curve Method of Factorization", Peter Montgomery, Dissertation, 1992, Chapter 5. Ternary return value. */ static void fin_diff_coeff (listz_t coeffs, mpz_t s, mpz_t D, unsigned int E, int dickson_a) { unsigned int i, k; mpz_t t; MPZ_INIT (t); mpz_set (t, s); for (i = 0; i <= E; i++) { if (dickson_a != 0) /* fd[i] = dickson_{E,a} (s+i*D) */ dickson (coeffs[i], t, E, dickson_a); else /* fd[i] = (s+i*D)^E */ mpz_pow_ui (coeffs[i], t, E); mpz_add (t, t, D); /* t = s + i * D */ } for (k = 1; k <= E; k++) for (i = E; i >= k; i--) mpz_sub (coeffs[i], coeffs[i], coeffs[i-1]); mpz_clear (t); } /* Init several disjoint progressions for the computation of Dickson_{E,a} (e * (i0 + i + n * d * k)), for 0 <= i < d * k (1) with gcd(e * (i0 + i), d) == 1, i == 1 (mod m), where m divides d for successive n (the variable n does not appear here, it is the application that called this function that wants to evaluate (1) for n = 0, 1, 2, ... This means there will be k sets of progressions, where each set contains eulerphi(d) progressions that generate the values of Dickson_{E,a} (x) with x coprime to d and with i == 1 (mod m), where x == e * (i0 + i) (mod m). i0 may be a NULL pointer, in this case i0 = 0 is assumed. Return NULL if an error occurred. 
*/ listz_t init_progression_coeffs (mpz_t i0, const unsigned long d, const unsigned long e, const unsigned int k, const unsigned int m, const unsigned int E, const int dickson_a) { unsigned int i, j, size_fd; mpz_t t, dke, em; listz_t fd; ASSERT (d % m == 0); size_fd = k * (eulerphi(d) / eulerphi(m)) * (E + 1); fd = (listz_t) malloc (size_fd * sizeof (mpz_t)); if (fd == NULL) return NULL; for (i = 0; i < size_fd; i++) MPZ_INIT (fd[i]); MPZ_INIT (t); if (i0 != NULL) mpz_set (t, i0); outputf (OUTPUT_TRACE, "init_progression_coeffs: i0 = %Zd, d = %u, e = %u, " "k = %u, m = %u, E = %u, a = %d, size_fd = %u\n", t, d, e, k, m, E, dickson_a, size_fd); /* Due to the condition i == 1 (mod m) we start at i = 1 or i = 0, depending on whether m > 1 or m == 1 */ i = (m > 1) ? 1 : 0; mpz_add_ui (t, t, (unsigned long) i); mpz_mul_ui (t, t, e); /* Now t = e * (i0 + i + n * d * k), for n = 0 */ /* dke = d * k * e, the common difference of the arithmetic progressions (it is the same for all arithmetic progressions we initialise) */ MPZ_INIT (dke); mpz_set_ui (dke, d); mpz_mul_ui (dke, dke, k); mpz_mul_ui (dke, dke, e); /* em = e * m, the value by which t advances if we increase i by m */ MPZ_INIT (em); mpz_set_ui (em, e); mpz_mul_ui (em, em, (unsigned long) m); for (j = 0; i < k * d; i += m) { if (mpz_gcd_ui (NULL, t, d) == 1) { outputf (OUTPUT_TRACE, "init_progression_coeffs: initing a " "progression for Dickson_{%d,%d}(%Zd + n * %Zd)\n", E, dickson_a, t, dke); /* Initialise for the evaluation of Dickson_{E,a} (t + n*dke) for n = 0, 1, 2, ... 
*/ fin_diff_coeff (fd + j, t, dke, E, dickson_a); j += E + 1; } else if (test_verbose (OUTPUT_TRACE)) outputf (OUTPUT_TRACE, "init_progression_coeffs: NOT initing a " "progression for Dickson_{%d,%d}(%Zd + n * %Zd), " "gcd (%Zd, %u) == %u)\n", E, dickson_a, t, dke, t, d, mpz_gcd_ui (NULL, t, d)); /* We increase i by m, so we increase t by e*m */ mpz_add (t, t, em); } mpz_clear (em); mpz_clear (dke); mpz_clear (t); return fd; } void init_roots_params (progression_params_t *params, const int S, const unsigned long d1, const unsigned long d2, const double cost) { ASSERT (gcd (d1, d2) == 1); /* If S < 0, use degree |S| Dickson poly, otherwise use x^S */ params->S = abs (S); params->dickson_a = (S < 0) ? -1 : 0; /* We only calculate Dickson_{S, a}(j * d2) * s where gcd (j, dsieve) == 1 and j == 1 (mod 6) by doing nr = eulerphi(dsieve)/2 separate progressions. */ /* Now choose a value for dsieve. */ params->dsieve = 6; params->nr = 1; /* Prospective saving by sieving out multiples of 5: d1 / params->dsieve * params->nr / 5 roots, each one costs S point adds Prospective cost increase: 4 times as many progressions to init (that is, 3 * params->nr more), each costs ~ S * S * log_2(5 * dsieve * d2) / 2 point adds The params->nr and one S cancel. */ if (d1 % 5 == 0 && d1 / params->dsieve / 5. * cost > 3. * params->S * log (5. * params->dsieve * d2) / 2.) { params->dsieve *= 5; params->nr *= 4; } if (d1 % 7 == 0 && d1 / params->dsieve / 7. * cost > 5. * params->S * log (7. * params->dsieve * d2) / 2.) { params->dsieve *= 7; params->nr *= 6; } if (d1 % 11 == 0 && d1 / params->dsieve / 11. * cost > 9. * params->S * log (11. * params->dsieve * d2) / 2.) 
{ params->dsieve *= 11; params->nr *= 10; } params->size_fd = params->nr * (params->S + 1); params->next = 0; params->rsieve = 1; } double memory_use (unsigned long dF, unsigned int sp_num, unsigned int Ftreelvl, mpmod_t modulus) { double mem; /* printf ("memory_use (%lu, %d, %d, )\n", dF, sp_num, Ftreelvl); */ mem = 9.0; /* F:1, T:3*2, invF:1, G:1 */ mem += (double) Ftreelvl; mem *= (double) dF; mem += 2. * list_mul_mem (dF); /* Also in T */ #if (MULT == KS) /* estimated memory for kronecker_schonhage / wrap-case in PrerevertDivision respectively */ mem += (24.0 + 1.0) * (double) (sp_num ? MIN(MUL_NTT_THRESHOLD, dF) : dF); #endif mem *= (double) (mpz_size (modulus->orig_modulus)) * sizeof (mp_limb_t) + sizeof (mpz_t); if (sp_num) mem += /* peak malloc in ecm_ntt.c */ (4.0 * dF * sp_num * sizeof (sp_t)) /* mpzspv_normalise */ + (MPZSPV_NORMALISE_STRIDE * ((double) sp_num * sizeof (sp_t) + 6.0 * sizeof (sp_t) + sizeof (float))) /* sp_F, sp_invF */ + ((1.0 + 2.0) * dF * sp_num * sizeof (sp_t)); return mem; } /* Input: X is the point at end of stage 1 n is the number to factor B2min-B2 is the stage 2 range (we consider B2min is done) k0 is the number of blocks (if 0, use default) S is the exponent for Brent-Suyama's extension invtrick is non-zero iff one uses x+1/x instead of x. method: ECM_ECM, ECM_PM1 or ECM_PP1 Cf "Speeding the Pollard and Elliptic Curve Methods of Factorization", Peter Montgomery, Math. of Comp., 1987, page 257: using x^(i^e)+1/x^(i^e) instead of x^(i^(2e)) reduces the cost of Brent-Suyama's extension from 2*e to e+3 multiplications per value of i. Output: f is the factor found Return value: 2 (step number) iff a factor was found, or ECM_ERROR if an error occurred. 
*/ int stage2 (mpz_t f, void *X, mpmod_t modulus, unsigned long dF, unsigned long k, root_params_t *root_params, int method, int use_ntt, char *TreeFilename, int (*stop_asap)(void)) { unsigned long i, sizeT; mpz_t n; listz_t F, G, H, T; int youpi = ECM_NO_FACTOR_FOUND; long st, st0; void *rootsG_state = NULL; listz_t *Tree = NULL; /* stores the product tree for F */ unsigned int treefiles_used = 0; /* Number of tree files currently in use */ unsigned int lgk; /* ceil(log(k)/log(2)) */ listz_t invF = NULL; double mem; mpzspm_t mpzspm = NULL; mpzspv_t sp_F = NULL, sp_invF = NULL; /* check alloc. size of f */ mpres_realloc (f, modulus); st0 = cputime (); Fermat = 0; if (modulus->repr == ECM_MOD_BASE2 && modulus->Fermat > 0) { Fermat = modulus->Fermat; use_ntt = 0; /* don't use NTT for Fermat numbers */ } if (use_ntt) { mpzspm = mpzspm_init (2 * dF, modulus->orig_modulus); if (mpzspm == NULL) { outputf (OUTPUT_ERROR, "Could not initialise mpzspm, " "presumably out of memory\n"); return ECM_ERROR; } outputf (OUTPUT_VERBOSE, "Using %u small primes for NTT\n", mpzspm->sp_num); } lgk = ceil_log2 (dF); mem = memory_use (dF, use_ntt ? mpzspm->sp_num : 0, (TreeFilename == NULL) ? 
lgk : 0, modulus); if (mem < 1e4) outputf (OUTPUT_VERBOSE, "Estimated memory usage: %1.0f\n", mem); else if (mem < 1e7) outputf (OUTPUT_VERBOSE, "Estimated memory usage: %1.0fK\n", mem / 1024.); else if (mem < 1e10) outputf (OUTPUT_VERBOSE, "Estimated memory usage: %1.0fM\n", mem / 1048576.); else outputf (OUTPUT_VERBOSE, "Estimated memory usage: %1.0fG\n", mem / 1073741824.); MEMORY_TAG; F = init_list2 (dF + 1, mpz_sizeinbase (modulus->orig_modulus, 2) + 3 * GMP_NUMB_BITS); MEMORY_UNTAG; if (F == NULL) { youpi = ECM_ERROR; goto clear_i0; } sizeT = 3 * dF + list_mul_mem (dF); if (dF > 3) sizeT += dF; MEMORY_TAG; T = init_list2 (sizeT, 2 * mpz_sizeinbase (modulus->orig_modulus, 2) + 3 * GMP_NUMB_BITS); MEMORY_UNTAG; if (T == NULL) { youpi = ECM_ERROR; goto clear_F; } H = T; /* needs dF+1 cells in T */ if (method == ECM_PM1) youpi = pm1_rootsF (f, F, root_params, dF, (mpres_t*) X, T, modulus); else if (method == ECM_PP1) youpi = pp1_rootsF (F, root_params, dF, (mpres_t*) X, T, modulus); else youpi = ecm_rootsF (f, F, root_params, dF, (curve*) X, modulus); if (youpi != ECM_NO_FACTOR_FOUND) { if (youpi != ECM_ERROR) youpi = ECM_FACTOR_FOUND_STEP2; goto clear_T; } if (stop_asap != NULL && (*stop_asap)()) goto clear_T; if (test_verbose (OUTPUT_TRACE)) { unsigned long j; for (j = 0; j < dF; j++) outputf (OUTPUT_TRACE, "f_%lu = %Zd\n", j, F[j]); } /* ---------------------------------------------- | F | invF | G | T | ---------------------------------------------- | rootsF | ??? | ??? | ??? 
| ---------------------------------------------- */ if (TreeFilename == NULL) { Tree = (listz_t*) malloc (lgk * sizeof (listz_t)); if (Tree == NULL) { outputf (OUTPUT_ERROR, "Error: not enough memory\n"); youpi = ECM_ERROR; goto clear_T; } for (i = 0; i < lgk; i++) { MEMORY_TAG; Tree[i] = init_list2 (dF, mpz_sizeinbase (modulus->orig_modulus, 2) + GMP_NUMB_BITS); MEMORY_UNTAG; if (Tree[i] == NULL) { /* clear already allocated Tree[i] */ while (i) clear_list (Tree[--i], dF); free (Tree); youpi = ECM_ERROR; goto clear_T; } } } else Tree = NULL; #ifdef TELLEGEN_DEBUG outputf (OUTPUT_ALWAYS, "Roots = "); print_list (os, F, dF); #endif mpz_init_set (n, modulus->orig_modulus); st = cputime (); if (TreeFilename != NULL) { FILE *TreeFile; char *fullname = (char *) malloc (strlen (TreeFilename) + 1 + 2 + 1); if (fullname == NULL) { fprintf (stderr, "Cannot allocate memory in stage2\n"); exit (1); } for (i = lgk; i > 0; i--) { if (stop_asap != NULL && (*stop_asap)()) goto free_Tree_i; sprintf (fullname, "%s.%lu", TreeFilename, i - 1); TreeFile = fopen (fullname, "wb"); if (TreeFile == NULL) { outputf (OUTPUT_ERROR, "Error opening file for product tree of F\n"); youpi = ECM_ERROR; goto free_Tree_i; } treefiles_used++; if (use_ntt) { if (ntt_PolyFromRoots_Tree (F, F, dF, T, i - 1, mpzspm, NULL, TreeFile) == ECM_ERROR) { fclose (TreeFile); youpi = ECM_ERROR; goto free_Tree_i; } } else { if (PolyFromRoots_Tree (F, F, dF, T, i - 1, n, NULL, TreeFile, 0) == ECM_ERROR) { fclose (TreeFile); youpi = ECM_ERROR; goto free_Tree_i; } } if (fclose (TreeFile) != 0) { youpi = ECM_ERROR; goto free_Tree_i; } } free (fullname); } else { /* TODO: how to check for stop_asap() here? 
*/ if (use_ntt) ntt_PolyFromRoots_Tree (F, F, dF, T, -1, mpzspm, Tree, NULL); else PolyFromRoots_Tree (F, F, dF, T, -1, n, Tree, NULL, 0); } if (test_verbose (OUTPUT_TRACE)) { unsigned long j; for (j = 0; j < dF; j++) outputf (OUTPUT_TRACE, "F[%lu] = %Zd\n", j, F[j]); } outputf (OUTPUT_VERBOSE, "Building F from its roots took %ldms\n", elltime (st, cputime ())); if (stop_asap != NULL && (*stop_asap)()) goto free_Tree_i; /* needs dF+list_mul_mem(dF/2) cells in T */ mpz_set_ui (F[dF], 1); /* the leading monic coefficient needs to be stored explicitly for PrerevertDivision */ /* ---------------------------------------------- | F | invF | G | T | ---------------------------------------------- | F(x) | ??? | ??? | ??? | ---------------------------------------------- */ /* G*H has degree 2*dF-2, hence we must cancel dF-1 coefficients to get degree dF-1 */ if (dF > 1) { /* only dF-1 coefficients of 1/F are needed to reduce G*H, but we need one more for TUpTree */ MEMORY_TAG; invF = init_list2 (dF + 1, mpz_sizeinbase (modulus->orig_modulus, 2) + 2 * GMP_NUMB_BITS); MEMORY_UNTAG; if (invF == NULL) { youpi = ECM_ERROR; goto free_Tree_i; } st = cputime (); if (use_ntt) { sp_F = mpzspv_init (dF, mpzspm); mpzspv_from_mpzv (sp_F, 0, F, dF, mpzspm); mpzspv_to_ntt (sp_F, 0, dF, dF, 1, mpzspm); ntt_PolyInvert (invF, F + 1, dF, T, mpzspm); sp_invF = mpzspv_init (2 * dF, mpzspm); mpzspv_from_mpzv (sp_invF, 0, invF, dF, mpzspm); mpzspv_to_ntt (sp_invF, 0, dF, 2 * dF, 0, mpzspm); } else PolyInvert (invF, F + 1, dF, T, n); /* now invF[0..dF-1] = Quo(x^(2dF-1), F) */ outputf (OUTPUT_VERBOSE, "Computing 1/F took %ldms\n", elltime (st, cputime ())); /* ---------------------------------------------- | F | invF | G | T | ---------------------------------------------- | F(x) | 1/F(x) | ??? | ??? | ---------------------------------------------- */ } if (stop_asap != NULL && (*stop_asap)()) goto clear_invF; /* start computing G with roots at i0*d, (i0+1)*d, (i0+2)*d, ... 
where i0*d <= B2min < (i0+1)*d */ MEMORY_TAG; G = init_list2 (dF, mpz_sizeinbase (modulus->orig_modulus, 2) + 3 * GMP_NUMB_BITS); MEMORY_UNTAG; if (G == NULL) { youpi = ECM_ERROR; goto clear_invF; } st = cputime (); if (method == ECM_PM1) rootsG_state = pm1_rootsG_init ((mpres_t *) X, root_params, modulus); else if (method == ECM_PP1) rootsG_state = pp1_rootsG_init ((mpres_t *) X, root_params, modulus); else /* ECM_ECM */ rootsG_state = ecm_rootsG_init (f, (curve *) X, root_params, dF, k, modulus); /* rootsG_state=NULL if an error occurred or (ecm only) a factor was found */ if (rootsG_state == NULL) { /* ecm: f = -1 if an error occurred */ youpi = (method == ECM_ECM && mpz_cmp_si (f, -1)) ? ECM_FACTOR_FOUND_STEP2 : ECM_ERROR; goto clear_G; } if (method != ECM_ECM) /* ecm_rootsG_init prints itself */ outputf (OUTPUT_VERBOSE, "Initializing table of differences for G " "took %ldms\n", elltime (st, cputime ())); if (stop_asap != NULL && (*stop_asap)()) goto clear_fd; for (i = 0; i < k; i++) { /* needs dF+1 cells in T+dF */ if (method == ECM_PM1) youpi = pm1_rootsG (f, G, dF, (pm1_roots_state_t *) rootsG_state, T + dF, modulus); else if (method == ECM_PP1) youpi = pp1_rootsG (G, dF, (pp1_roots_state_t *) rootsG_state, modulus, (mpres_t *) X); else youpi = ecm_rootsG (f, G, dF, (ecm_roots_state_t *) rootsG_state, modulus); if (test_verbose (OUTPUT_TRACE)) { unsigned long j; for (j = 0; j < dF; j++) outputf (OUTPUT_TRACE, "g_%lu = %Zd\n", j, G[j]); } ASSERT(youpi != ECM_ERROR); /* xxx_rootsG cannot fail */ if (youpi) /* factor found */ { youpi = ECM_FACTOR_FOUND_STEP2; goto clear_fd; } if (stop_asap != NULL && (*stop_asap)()) goto clear_fd; /* ----------------------------------------------- | F | invF | G | T | ----------------------------------------------- | F(x) | 1/F(x) | rootsG | ??? 
| ----------------------------------------------- */ st = cputime (); if (use_ntt) ntt_PolyFromRoots (G, G, dF, T + dF, mpzspm); else PolyFromRoots (G, G, dF, T + dF, n); if (test_verbose (OUTPUT_TRACE)) { unsigned long j; outputf (OUTPUT_TRACE, "G(x) = x^%lu ", dF); for (j = 0; j < dF; j++) outputf (OUTPUT_TRACE, "+ (%Zd * x^%lu)", G[j], j); outputf (OUTPUT_TRACE, "\n"); } /* needs 2*dF+list_mul_mem(dF/2) cells in T */ outputf (OUTPUT_VERBOSE, "Building G from its roots took %ldms\n", elltime (st, cputime ())); if (stop_asap != NULL && (*stop_asap)()) goto clear_fd; /* ----------------------------------------------- | F | invF | G | T | ----------------------------------------------- | F(x) | 1/F(x) | G(x) | ??? | ----------------------------------------------- */ if (i == 0) { list_sub (H, G, F, dF); /* coefficients 1 of degree cancel, thus T is of degree < dF */ list_mod (H, H, dF, n); /* ------------------------------------------------ | F | invF | G | T | ------------------------------------------------ | F(x) | 1/F(x) | ??? |G(x)-F(x)| ??? | ------------------------------------------------ */ } else { /* since F and G are monic of same degree, G mod F = G - F */ list_sub (G, G, F, dF); list_mod (G, G, dF, n); /* ------------------------------------------------ | F | invF | G | T | ------------------------------------------------ | F(x) | 1/F(x) |G(x)-F(x)| H(x) | | ------------------------------------------------ */ st = cputime (); /* previous G mod F is in H, with degree < dF, i.e. 
dF coefficients: requires 3dF-1+list_mul_mem(dF) cells in T */ if (use_ntt) { ntt_mul (T + dF, G, H, dF, T + 3 * dF, 0, mpzspm); list_mod (H, T + dF, 2 * dF, n); } else list_mulmod (H, T + dF, G, H, dF, T + 3 * dF, n); outputf (OUTPUT_VERBOSE, "Computing G * H took %ldms\n", elltime (st, cputime ())); if (stop_asap != NULL && (*stop_asap)()) goto clear_fd; /* ------------------------------------------------ | F | invF | G | T | ------------------------------------------------ | F(x) | 1/F(x) |G(x)-F(x)| G * H | | ------------------------------------------------ */ st = cputime (); if (use_ntt) { ntt_PrerevertDivision (H, F, invF + 1, sp_F, sp_invF, dF, T + 2 * dF, mpzspm); } else { if (PrerevertDivision (H, F, invF + 1, dF, T + 2 * dF, n)) { youpi = ECM_ERROR; goto clear_fd; } } outputf (OUTPUT_VERBOSE, "Reducing G * H mod F took %ldms\n", elltime (st, cputime ())); if (stop_asap != NULL && (*stop_asap)()) goto clear_fd; } } clear_list (F, dF + 1); F = NULL; clear_list (G, dF); G = NULL; st = cputime (); #ifdef POLYEVALTELLEGEN if (use_ntt) youpi = ntt_polyevalT (T, dF, Tree, T + dF + 1, sp_invF, mpzspm, TreeFilename); else youpi = polyeval_tellegen (T, dF, Tree, T + dF + 1, sizeT - dF - 1, invF, n, TreeFilename); if (youpi) { outputf (OUTPUT_ERROR, "Error, not enough memory\n"); goto clear_fd; } #else clear_list (invF, dF + 1); invF = NULL; polyeval (T, dF, Tree, T + dF + 1, n, 0); #endif treefiles_used = 0; /* Polyeval deletes treefiles by itself */ if (test_verbose (OUTPUT_TRACE)) { unsigned long j; for (j = 0; j < dF; j++) outputf (OUTPUT_TRACE, "G(x_%lu) = %Zd\n", j, T[j]); } outputf (OUTPUT_VERBOSE, "Computing polyeval(F,G) took %ldms\n", elltime (st, cputime ())); st = cputime (); list_mulup (T, dF, n, T[dF]); outputf (OUTPUT_VERBOSE, "Computing product of all F(g_i) took %ldms\n", elltime (st, cputime ())); mpz_gcd (f, T[dF - 1], n); if (mpz_cmp_ui (f, 1) > 0) { youpi = ECM_FACTOR_FOUND_STEP2; if (method == ECM_ECM && test_verbose (OUTPUT_RESVERBOSE)) { /* 
Find out for which i*X, (i,d)==1, a factor was found */ /* Note that the factor we found may be composite */ /* TBD: use binary search */ unsigned long j, k; mpz_set (T[dF], f); for (k = 0, j = 1; k < dF; j += 6) { if (gcd (j, root_params->d1) > 1) continue; mpz_gcd (T[dF + 1], T[k], T[dF]); if (mpz_cmp_ui (T[dF + 1], 1) > 0) { int sgn; /* Find i so that $f(i d1) X = +-f(j d2) X$ over GF(f) */ sgn = ecm_findmatch (&i, j, root_params, (curve *)X, modulus, f); if (sgn != 0) { mpz_add_ui (T[dF + 2], root_params->i0, i); outputf (OUTPUT_RESVERBOSE, "Divisor %Zd first occurs in T[%lu] = " "((f(%Zd*%lu)%cf(%lu*%lu))*X)_x\n", T[dF + 1], k, T[dF + 2], root_params->d1, sgn < 0 ? '+' : '-', j, root_params->d2); mpz_mul_ui (T[dF + 2], T[dF + 2], root_params->d1); if (sgn < 0) mpz_add_ui (T[dF + 2], T[dF + 2], j * root_params->d2); else mpz_sub_ui (T[dF + 2], T[dF + 2], j * root_params->d2); mpz_abs (T[dF + 2], T[dF + 2]); outputf (OUTPUT_RESVERBOSE, "Maybe largest group order " "factor is or divides %Zd\n", T[dF + 2]); } else { outputf (OUTPUT_RESVERBOSE, "Divisor %Zd first occurs in T[%lu], but could " "not determine associated i\n", T[dF + 1], k); } /* Don't report this divisor again */ mpz_divexact (T[dF], T[dF], T[dF + 1]); } k++; } } } else { /* Here, mpz_cmp_ui (f, 1) == 0, i.e. 
no factor was found */ outputf (OUTPUT_RESVERBOSE, "Product of G(f_i) = %Zd\n", T[0]); } clear_fd: if (method == ECM_PM1) pm1_rootsG_clear ((pm1_roots_state_t *) rootsG_state, modulus); else if (method == ECM_PP1) pp1_rootsG_clear ((pp1_roots_state_t *) rootsG_state, modulus); else /* ECM_ECM */ ecm_rootsG_clear ((ecm_roots_state_t *) rootsG_state, modulus); clear_G: clear_list (G, dF); clear_invF: clear_list (invF, dF + 1); if (use_ntt) { mpzspv_clear (sp_F, mpzspm); mpzspv_clear (sp_invF, mpzspm); } free_Tree_i: if (Tree != NULL) { for (i = 0; i < lgk; i++) clear_list (Tree[i], dF); free (Tree); } if (TreeFilename != NULL && treefiles_used > 0) { /* Unlink any treefiles still in use */ char *fullname = (char *) malloc (strlen (TreeFilename) + 1 + 2 + 1); if (fullname == NULL) { fprintf (stderr, "Cannot allocate memory in stage2\n"); exit (1); } for (i = 0; i < treefiles_used; i++) { sprintf (fullname, "%s.%lu", TreeFilename, i); outputf (OUTPUT_DEVVERBOSE, "Unlinking %s\n", fullname); if (unlink (fullname) != 0) outputf (OUTPUT_ERROR, "Could not delete %s\n", fullname); } free (fullname); } mpz_clear (n); clear_T: clear_list (T, sizeT); clear_F: clear_list (F, dF + 1); clear_i0: if (use_ntt) mpzspm_clear (mpzspm); if (Fermat) F_clear (); if (stop_asap == NULL || !(*stop_asap)()) { st0 = elltime (st0, cputime ()); outputf (OUTPUT_NORMAL, "Step 2 took %ldms\n", st0); } return youpi; } ecm-6.4.4/longlong.h0000644023561000001540000017574712106741273011233 00000000000000/* longlong.h -- definitions for mixed size 32/64 bit arithmetic. Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc. This file was copied from the GNU MP Library. This file is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this file. If not, see http://www.gnu.org/licenses/. */ /* added for compatibility with other compilers than gcc: without the builtin, no expression is ever treated as a compile-time constant */ #if !defined(__GNUC__) #define __builtin_constant_p(x) 0 #endif /* You have to define the following before including this file: UWtype -- An unsigned type, default type for operations (typically a "word") UHWtype -- An unsigned type, at least half the size of UWtype. UDWtype -- An unsigned type, at least twice as large as UWtype. W_TYPE_SIZE -- size in bits of UWtype SItype, USItype -- Signed and unsigned 32 bit types. DItype, UDItype -- Signed and unsigned 64 bit types. On a 32 bit machine UWtype should typically be USItype; on a 64 bit machine, UWtype should typically be UDItype. The helpers defined next work on half words: __BITS4 is a quarter of the word size in bits, __ll_B is 2^(W_TYPE_SIZE/2), and __ll_lowpart(t) / __ll_highpart(t) return the low / high half of t as a UWtype. */ #define __BITS4 (W_TYPE_SIZE / 4) #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) /* This is used to make sure no undesirable sharing between different libraries that use this file takes place. Unless the includer defines __MPN itself, __MPN(x) expands to x with a "__" prefix. */ #ifndef __MPN #define __MPN(x) __##x #endif #ifndef _PROTO #if (__STDC__-0) || defined (__cplusplus) || defined( _MSC_VER ) #define _PROTO(x) x #else #define _PROTO(x) () #endif #endif /* Define auxiliary asm macros. 1) umul_ppmm(high_prod, low_prod, multipler, multiplicand) multiplies two UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype word product in HIGH_PROD and LOW_PROD. 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a UDWtype product. This is just a variant of umul_ppmm.
3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator) divides a UDWtype, composed of the UWtype integers HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less than DENOMINATOR for correct operation. If, in addition, the most significant bit of DENOMINATOR must be 1, then the pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1. 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator). Like udiv_qrnnd but the numbers are signed. The quotient is rounded towards 0. 5) count_leading_zeros(count, x) counts the number of zero-bits from the msb to the first non-zero bit in the UWtype X. This is the number of steps X needs to be shifted left to set the msb. Undefined for X == 0, unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value. 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts from the least significant end. 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1, high_addend_2, low_addend_2) adds two two-word UWtype integers, composed of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow (i.e. carry out) is not stored anywhere, and is lost. 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend, high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers, composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE and LOW_DIFFERENCE. Underflow (i.e. borrow out) is not stored anywhere, and is lost. If any of these macros are left undefined for a particular CPU, C macros are used. Notes: For add_ssaaaa the two high and two low addends can both commute, but unfortunately gcc only supports one "%" commutative in each asm block. This has always been so but is only documented in recent versions (e.g.
pre-release 3.3). Having two or more "%"s can cause an internal compiler error in certain rare circumstances. Apparently it was only the last "%" that was ever actually respected, so the code has been updated to leave just that. Clearly there's a free choice whether high or low should get it, if there's a reason to favour one over the other. Also obviously when the constraints on the two operands are identical there's no benefit to the reloader in any "%" at all. */ /* The CPUs come in alphabetical order below. Please add support for more CPUs here, or improve the current support for the CPUs below! */ /* FIXME: The macros using external routines like __MPN(count_leading_zeros) don't need to be under !NO_ASM */ #if ! defined (NO_ASM) #if defined (__alpha) && W_TYPE_SIZE == 64 /* Most alpha-based machines, except Cray systems. */ #if defined (__GNUC__) #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("umulh %r1,%2,%0" \ : "=r" (ph) \ : "%rJ" (m0), "rI" (m1)); \ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 18 #else /* ! __GNUC__ */ #include #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ (ph) = __UMULH (m0, m1); \ (pl) = __m0 * __m1; \ } while (0) #endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __di; \ __di = __MPN(invert_limb) (d); \ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ } while (0) #define UDIV_PREINV_ALWAYS 1 #define UDIV_NEEDS_NORMALIZATION 1 #define UDIV_TIME 220 #endif /* LONGLONG_STANDALONE */ /* clz_tab is required by mpn/alpha/cntlz.asm, and that file is built for all alphas, even though ev67 and ev68 don't need it. 
*/ #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #if defined (__GNUC__) && \ (defined(HAVE_HOST_CPU_alphaev67) && HAVE_HOST_CPU_alphaev67 || \ defined(HAVE_HOST_CPU_alphaev68) && HAVE_HOST_CPU_alphaev68) #define count_leading_zeros(COUNT,X) \ __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) #define count_trailing_zeros(COUNT,X) \ __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) #else /* ! (ev67 || ev68) */ #ifndef LONGLONG_STANDALONE #if defined(HAVE_ATTRIBUTE_CONST) && HAVE_ATTRIBUTE_CONST long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const)); #else long __MPN(count_leading_zeros) _PROTO ((UDItype)); #endif #define count_leading_zeros(count, x) \ ((count) = __MPN(count_leading_zeros) (x)) #endif /* LONGLONG_STANDALONE */ #endif /* ! (ev67 || ev68) */ #endif /* __alpha */ #if defined (_CRAY) && W_TYPE_SIZE == 64 #include #define UDIV_PREINV_ALWAYS 1 #define UDIV_NEEDS_NORMALIZATION 1 #define UDIV_TIME 220 long __MPN(count_leading_zeros) _PROTO ((UDItype)); #define count_leading_zeros(count, x) \ ((count) = _leadz ((UWtype) (x))) #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ (ph) = _int_mult_upper (m0, m1); \ (pl) = __m0 * __m1; \ } while (0) #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __di; \ __di = __MPN(invert_limb) (d); \ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ } while (0) #endif /* LONGLONG_STANDALONE */ #endif /* _CRAYIEEE */ #endif /* _CRAY */ #if defined (__hppa) && W_TYPE_SIZE == 64 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC (3.2) puts longlong into two adjacent 32-bit registers. Presumably this is just a case of no direct support for 2.0n but treating it like 1.0. */ #if defined (__GNUC__) && ! 
defined (_LONG_LONG_LIMB) #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add %4,%5,%1\n\tadd,dc %2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rM" (ah), "rM" (bh), "%rM" (al), "rM" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub %4,%5,%1\n\tsub,db %2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rM" (ah), "rM" (bh), "rM" (al), "rM" (bl)) #endif /* We put the result pointer parameter last here, since it makes passing of the other parameters more efficient. */ #ifndef LONGLONG_STANDALONE #define umul_ppmm(wh, wl, u, v) \ do { \ UWtype __p0; \ (wh) = __MPN(umul_ppmm) (u, v, &__p0); \ (wl) = __p0; \ } while (0) extern UWtype __MPN(umul_ppmm) _PROTO ((UWtype, UWtype, UWtype *)); #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (n1, n0, d, &__r); \ (r) = __r; \ } while (0) extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype, UWtype, UWtype, UWtype *)); #define UMUL_TIME 8 #define UDIV_TIME 60 #endif /* LONGLONG_STANDALONE */ #endif /* hppa */ #if defined (__ia64) && W_TYPE_SIZE == 64 #if defined (__GNUC__) #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("xma.hu %0 = %1, %2, f0" \ : "=f" (ph) \ : "f" (m0), "f" (m1)); \ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 14 #define count_leading_zeros(count, x) \ do { \ UWtype _x = (x), _y, _a, _c; \ __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ _c = (_a - 1) << 3; \ _x >>= _c; \ if (_x >= 1 << 4) \ _x >>= 4, _c += 4; \ if (_x >= 1 << 2) \ _x >>= 2, _c += 2; \ _c += _x >> 1; \ (count) = W_TYPE_SIZE - 1 - _c; \ } while (0) #endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __di; \ __di = __MPN(invert_limb) (d); \ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ } while (0) #define UDIV_PREINV_ALWAYS 1 #define UDIV_NEEDS_NORMALIZATION 1 #endif #define UDIV_TIME 220 #endif #if defined (__GNUC__) /* We sometimes need to clobber "cc" with gcc2, but 
that would not be understood by gcc1. Use cpp to avoid major code duplication. */ #if __GNUC__ < 2 #define __CLOBBER_CC #define __AND_CLOBBER_CC #else /* __GNUC__ >= 2 */ #define __CLOBBER_CC : "cc" #define __AND_CLOBBER_CC , "cc" #endif /* __GNUC__ < 2 */ #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) #define umul_ppmm(xh, xl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ __asm__ ("multiplu %0,%1,%2" \ : "=r" (xl) \ : "r" (__m0), "r" (__m1)); \ __asm__ ("multmu %0,%1,%2" \ : "=r" (xh) \ : "r" (__m0), "r" (__m1)); \ } while (0) #define udiv_qrnnd(q, r, n1, n0, d) \ __asm__ ("dividu %0,%3,%4" \ : "=r" (q), "=q" (r) \ : "1" (n1), "r" (n0), "r" (d)) #define count_leading_zeros(count, x) \ __asm__ ("clz %0,%1" \ : "=r" (count) \ : "r" (x)) #define COUNT_LEADING_ZEROS_0 32 #endif /* __a29k__ */ #if defined (__arc__) #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ : "=r" ((USItype) (sh)), \ "=&r" ((USItype) (sl)) \ : "r" ((USItype) (ah)), \ "rIJ" ((USItype) (bh)), \ "%r" ((USItype) (al)), \ "rIJ" ((USItype) (bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ : "=r" ((USItype) (sh)), \ "=&r" ((USItype) (sl)) \ : "r" ((USItype) (ah)), \ "rIJ" ((USItype) (bh)), \ "r" ((USItype) (al)), \ "rIJ" ((USItype) (bl))) #endif #if defined (__arm__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (al)) \ { \ if 
(__builtin_constant_p (ah)) \ __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ : "=r" (sh), "=&r" (sl) \ : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ else \ __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ } \ else if (__builtin_constant_p (ah)) \ { \ if (__builtin_constant_p (bl)) \ __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ : "=r" (sh), "=&r" (sl) \ : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ else \ __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ : "=r" (sh), "=&r" (sl) \ : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ } \ else if (__builtin_constant_p (bl)) \ { \ if (__builtin_constant_p (bh)) \ __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ else \ __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ : "=r" (sh), "=&r" (sl) \ : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ } \ else /* only bh might be a constant */ \ __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\ } while (0) #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ #define umul_ppmm(xh, xl, a, b) \ __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) #define UMUL_TIME 5 #define smul_ppmm(xh, xl, a, b) \ __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __di; \ __di = __MPN(invert_limb) (d); \ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ } while (0) #define UDIV_PREINV_ALWAYS 1 #define UDIV_NEEDS_NORMALIZATION 1 #define UDIV_TIME 70 #endif /* LONGLONG_STANDALONE */ #else #define umul_ppmm(xh, xl, a, b) \ __asm__ ("%@ Inlined umul_ppmm\n" \ " mov %|r0, %2, lsr #16\n" \ " mov %|r2, %3, lsr #16\n" \ " bic %|r1, %2, %|r0, lsl #16\n" \ " bic %|r2, 
%3, %|r2, lsl #16\n" \ " mul %1, %|r1, %|r2\n" \ " mul %|r2, %|r0, %|r2\n" \ " mul %|r1, %0, %|r1\n" \ " mul %0, %|r0, %0\n" \ " adds %|r1, %|r2, %|r1\n" \ " addcs %0, %0, #65536\n" \ " adds %1, %1, %|r1, lsl #16\n" \ " adc %0, %0, %|r1, lsr #16" \ : "=&r" (xh), "=r" (xl) \ : "r" (a), "r" (b) \ : "r0", "r1", "r2") #define UMUL_TIME 20 #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #define UDIV_TIME 200 #endif /* LONGLONG_STANDALONE */ #endif #endif /* __arm__ */ #if defined (__clipper__) && W_TYPE_SIZE == 32 #define umul_ppmm(w1, w0, u, v) \ ({union {UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __x; \ __asm__ ("mulwux %2,%0" \ : "=r" (__x.__ll) \ : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) #define smul_ppmm(w1, w0, u, v) \ ({union {DItype __ll; \ struct {SItype __l, __h;} __i; \ } __x; \ __asm__ ("mulwx %2,%0" \ : "=r" (__x.__ll) \ : "%0" ((SItype)(u)), "r" ((SItype)(v))); \ (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) #define __umulsidi3(u, v) \ ({UDItype __w; \ __asm__ ("mulwux %2,%0" \ : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ __w; }) #endif /* __clipper__ */ /* Fujitsu vector computers. 
*/ #if defined (__uxp__) && W_TYPE_SIZE == 32 #define umul_ppmm(ph, pl, u, v) \ do { \ union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\ (ph) = __x.__i.__h; \ (pl) = __x.__i.__l; \ } while (0) #define smul_ppmm(ph, pl, u, v) \ do { \ union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \ (ph) = __x.__i.__h; \ (pl) = __x.__i.__l; \ } while (0) #endif #if defined (__gmicro__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add.w %5,%1\n\taddx %3,%0" \ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "%1" ((USItype)(al)), "g" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "1" ((USItype)(al)), "g" ((USItype)(bl))) #define umul_ppmm(ph, pl, m0, m1) \ __asm__ ("mulx %3,%0,%1" \ : "=g" ((USItype)(ph)), "=r" ((USItype)(pl)) \ : "%0" ((USItype)(m0)), "g" ((USItype)(m1))) #define udiv_qrnnd(q, r, nh, nl, d) \ __asm__ ("divx %4,%0,%1" \ : "=g" ((USItype)(q)), "=r" ((USItype)(r)) \ : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d))) #define count_leading_zeros(count, x) \ __asm__ ("bsch/1 %1,%0" \ : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0)) #endif #if defined (__hppa) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rM" (ah), "rM" (bh), "%rM" (al), "rM" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rM" (ah), "rM" (bh), "rM" (al), "rM" (bl)) #if defined (_PA_RISC1_1) #define umul_ppmm(wh, wl, u, v) \ do { \ union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __asm__ ("xmpyu %1,%2,%0" : "=*f" 
(__x.__ll) : "*f" (u), "*f" (v)); \ (wh) = __x.__i.__h; \ (wl) = __x.__i.__l; \ } while (0) #define UMUL_TIME 8 #define UDIV_TIME 60 #else #define UMUL_TIME 40 #define UDIV_TIME 80 #endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #endif /* LONGLONG_STANDALONE */ #define count_leading_zeros(count, x) \ do { \ USItype __tmp; \ __asm__ ( \ "ldi 1,%0\n" \ " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ " ldo 16(%0),%0 ; Yes. Perform add.\n" \ " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ " ldo 8(%0),%0 ; Yes. Perform add.\n" \ " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ " ldo 4(%0),%0 ; Yes. Perform add.\n" \ " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ " ldo 2(%0),%0 ; Yes. 
Perform add.\n" \ " extru %1,30,1,%1 ; Extract bit 1.\n" \ " sub %0,%1,%0 ; Subtract it.\n" \ : "=r" (count), "=r" (__tmp) : "1" (x)); \ } while (0) #endif /* hppa */ #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 #define smul_ppmm(xh, xl, m0, m1) \ do { \ union {DItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __asm__ ("lr %N0,%1\n\tmr %0,%2" \ : "=&r" (__x.__ll) \ : "r" (m0), "r" (m1)); \ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ } while (0) #define sdiv_qrnnd(q, r, n1, n0, d) \ do { \ union {DItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __x.__i.__h = n1; __x.__i.__l = n0; \ __asm__ ("dr %0,%2" \ : "=r" (__x.__ll) \ : "0" (__x.__ll), "r" (d)); \ (q) = __x.__i.__l; (r) = __x.__i.__h; \ } while (0) #endif #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addl %5,%1\n\tadcl %3,%0" \ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "%1" ((USItype)(al)), "g" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("subl %5,%1\n\tsbbl %3,%0" \ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "1" ((USItype)(al)), "g" ((USItype)(bl))) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("mull %3" \ : "=a" (w0), "=d" (w1) \ : "%0" ((USItype)(u)), "rm" ((USItype)(v))) #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ __asm__ ("divl %4" /* stringification in K&R C */ \ : "=a" (q), "=d" (r) \ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) /* P5 bsrl takes between 10 and 72 cycles depending where the most significant 1 bit is, hence the use of the alternatives below. bsfl is slow too, between 18 and 42 depending where the least significant 1 bit is. The faster count_leading_zeros are pressed into service via the generic count_trailing_zeros at the end of the file. 
*/

#if defined(HAVE_HOST_CPU_i586) && HAVE_HOST_CPU_i586 || \
  defined(HAVE_HOST_CPU_pentium) && HAVE_HOST_CPU_pentium

/* The following should be a fixed 14 cycles or so.  Some scheduling
   opportunities should be available between the float load/store too.  This
   is used (with "n&-n" to get trailing zeros) in gcc 3 for __builtin_ffs and
   is apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or up will be best for this, so the "double" is
   correctly aligned on the stack.

   How it works: n is converted to a double and the second 32-bit word of
   that double is read back through the union; shifting it right by 20
   leaves the biased exponent, from which the count is derived.  n must be
   nonzero (see the ASSERT).  */
#define count_leading_zeros(c,n) \
  do { \
    union { \
      double d; \
      unsigned a[2]; \
    } __u; \
    ASSERT ((n) != 0); \
    __u.d = (UWtype) (n); \
    (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
  } while (0)
/* Result of the expression above when the exponent field reads as 0.  */
#define COUNT_LEADING_ZEROS_0 (0x3FF + 31)

#else /* ! pentium */

#if defined(HAVE_HOST_CPU_pentiummmx) && HAVE_HOST_CPU_pentiummmx
/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
   cache miss reading from __clz_tab.  It's favoured over the float above so
   as to avoid mixing MMX and x87, since the penalty for switching between
   the two is about 100 cycles.

   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
   follows, but as of gcc 2.95.2 it results in conditional jumps.

       __shift = -(__n < 0x1000000);
       __shift -= (__n < 0x10000);
       __shift -= (__n < 0x100);

   The middle two sbbl and cmpl's pair, and with luck something gcc
   generates might pair with the first cmpl and the last sbbl.  The "32+1"
   constant could be folded into __clz_tab[], but it doesn't seem worth
   making a different table just for that.
*/

/* After the asm, __shift is one of -3, -2, -1, 0.  It is then rescaled to a
   bit shift of 1, 9, 17 or 25, and the final count comes from a __clz_tab
   lookup on __n shifted right by that amount.  */
#define count_leading_zeros(c,n) \
  do { \
    USItype __n = (n); \
    USItype __shift; \
    __asm__ ("cmpl $0x1000000, %1\n" \
             "sbbl %0, %0\n" \
             "cmpl $0x10000, %1\n" \
             "sbbl $0, %0\n" \
             "cmpl $0x100, %1\n" \
             "sbbl $0, %0\n" \
             : "=&r" (__shift) : "r" (__n)); \
    __shift = __shift*8 + 24 + 1; \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
  } while (0)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */

#else /* !pentiummmx */
/* On P6, gcc prior to 3.0 generates a partial register stall for
   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
   cost of one extra instruction.  Do this for "i386" too, since that means
   generic x86.  */
#if __GNUC__ < 3 \
  && (defined(HAVE_HOST_CPU_i386) && HAVE_HOST_CPU_i386 \
      || defined(HAVE_HOST_CPU_i686) && HAVE_HOST_CPU_i686 \
      || defined(HAVE_HOST_CPU_pentiumpro) && HAVE_HOST_CPU_pentiumpro \
      || defined(HAVE_HOST_CPU_pentium2) && HAVE_HOST_CPU_pentium2 \
      || defined(HAVE_HOST_CPU_pentium3) && HAVE_HOST_CPU_pentium3)
/* Workaround form: the bsrl result is subtracted from 31.  x must be
   nonzero.  */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = 31 - __cbtmp; \
  } while (0)
#else
/* Default form: the bsrl result is XORed with 31.  x must be nonzero.  */
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = __cbtmp ^ 31; \
  } while (0)
#endif

/* bsfl writes its result straight into count.  x must be nonzero.  */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \
  } while (0)
#endif /* ! pentiummmx */
#endif /* !
pentium */ #ifndef UMUL_TIME #define UMUL_TIME 10 #endif #ifndef UDIV_TIME #define UDIV_TIME 40 #endif #endif /* 80x86 */ #if defined (__x86_64__) && W_TYPE_SIZE == 64 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addq %5,%1\n\tadcq %3,%0" \ : "=r" ((UDItype)(sh)), "=&r" ((UDItype)(sl)) \ : "0" ((UDItype)(ah)), "g" ((UDItype)(bh)), \ "%1" ((UDItype)(al)), "g" ((UDItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("subq %5,%1\n\tsbbq %3,%0" \ : "=r" ((UDItype)(sh)), "=&r" ((UDItype)(sl)) \ : "0" ((UDItype)(ah)), "g" ((UDItype)(bh)), \ "1" ((UDItype)(al)), "g" ((UDItype)(bl))) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("mulq %3" \ : "=a" (w0), "=d" (w1) \ : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ __asm__ ("divq %4" /* stringification in K&R C */ \ : "=a" (q), "=d" (r) \ : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) #define count_leading_zeros(count, x) \ do { \ UDItype __cbtmp; \ ASSERT ((x) != 0); \ __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ (count) = __cbtmp ^ 63; \ } while (0) /* bsfq destination must be a 64-bit register, "%q0" forces this in case count is only an int. 
*/ #define count_trailing_zeros(count, x) \ do { \ ASSERT ((x) != 0); \ __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \ } while (0) #endif /* x86_64 */ #if defined (__i860__) && W_TYPE_SIZE == 32 #define rshift_rhlc(r,h,l,c) \ __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ "=r" (r) : "r" (h), "r" (l), "rn" (c)) #endif /* i860 */ #if defined (__i960__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \ : "=r" (sh), "=&r" (sl) \ : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \ : "=r" (sh), "=&r" (sl) \ : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl)) #define umul_ppmm(w1, w0, u, v) \ ({union {UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __x; \ __asm__ ("emul %2,%1,%0" \ : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \ (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) #define __umulsidi3(u, v) \ ({UDItype __w; \ __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \ __w; }) #define udiv_qrnnd(q, r, nh, nl, d) \ do { \ union {UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __nn; \ __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ __asm__ ("ediv %d,%n,%0" \ : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \ (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ } while (0) #define count_leading_zeros(count, x) \ do { \ USItype __cbtmp; \ __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \ (count) = __cbtmp ^ 31; \ } while (0) #define COUNT_LEADING_ZEROS_0 (-32) /* sic */ #if defined (__i960mx) /* what is the proper symbol to test??? 
*/ #define rshift_rhlc(r,h,l,c) \ do { \ union {UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __nn; \ __nn.__i.__h = (h); __nn.__i.__l = (l); \ __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ } #endif /* i960mx */ #endif /* i960 */ #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \ || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \ || defined (__mc5307__)) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \ : "=d" ((USItype)(sh)), "=&d" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ "%1" ((USItype)(al)), "g" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \ : "=d" ((USItype)(sh)), "=&d" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ "1" ((USItype)(al)), "g" ((USItype)(bl))) /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */ #if defined (__mc68020__) || defined(mc68020) \ || defined (__mc68030__) || defined (mc68030) \ || defined (__mc68040__) || defined (mc68040) \ || defined (__mcpu32__) || defined (mcpu32) \ || defined (__NeXT__) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("mulu%.l %3,%1:%0" \ : "=d" ((USItype)(w0)), "=d" ((USItype)(w1)) \ : "%0" ((USItype)(u)), "dmi" ((USItype)(v))) #define UMUL_TIME 45 #define udiv_qrnnd(q, r, n1, n0, d) \ __asm__ ("divu%.l %4,%1:%0" \ : "=d" ((USItype)(q)), "=d" ((USItype)(r)) \ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) #define UDIV_TIME 90 #define sdiv_qrnnd(q, r, n1, n0, d) \ __asm__ ("divs%.l %4,%1:%0" \ : "=d" ((USItype)(q)), "=d" ((USItype)(r)) \ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) #else /* for other 68k family members use 16x16->32 multiplication */ #define umul_ppmm(xh, xl, a, b) \ do { USItype __umul_tmp1, __umul_tmp2; \ __asm__ ("| Inlined umul_ppmm\n" \ " move%.l %5,%3\n" \ " move%.l %2,%0\n" \ " move%.w %3,%1\n" \ " swap %3\n" \ 
" swap %0\n" \ " mulu%.w %2,%1\n" \ " mulu%.w %3,%0\n" \ " mulu%.w %2,%3\n" \ " swap %2\n" \ " mulu%.w %5,%2\n" \ " add%.l %3,%2\n" \ " jcc 1f\n" \ " add%.l %#0x10000,%0\n" \ "1: move%.l %2,%3\n" \ " clr%.w %2\n" \ " swap %2\n" \ " swap %3\n" \ " clr%.w %3\n" \ " add%.l %3,%1\n" \ " addx%.l %2,%0\n" \ " | End inlined umul_ppmm" \ : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \ "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ } while (0) #define UMUL_TIME 100 #define UDIV_TIME 400 #endif /* not mc68020 */ /* The '020, '030, '040 and '060 have bitfield insns. GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to exclude bfffo on that chip (bitfield insns not available). */ #if (defined (__mc68020__) || defined (mc68020) \ || defined (__mc68030__) || defined (mc68030) \ || defined (__mc68040__) || defined (mc68040) \ || defined (__mc68060__) || defined (mc68060) \ || defined (__NeXT__)) \ && ! defined (__mcpu32__) #define count_leading_zeros(count, x) \ __asm__ ("bfffo %1{%b2:%b2},%0" \ : "=d" ((USItype) (count)) \ : "od" ((USItype) (x)), "n" (0)) #define COUNT_LEADING_ZEROS_0 32 #endif #endif /* mc68000 */ #if defined (__m88000__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \ : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \ : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl)) #define count_leading_zeros(count, x) \ do { \ USItype __cbtmp; \ __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \ (count) = __cbtmp ^ 31; \ } while (0) #define COUNT_LEADING_ZEROS_0 63 /* sic */ #if defined (__m88110__) #define umul_ppmm(wh, wl, u, v) \ do { \ union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ (wh) = __x.__i.__h; \ (wl) = 
__x.__i.__l; \ } while (0) #define udiv_qrnnd(q, r, n1, n0, d) \ ({union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __x, __q; \ __x.__i.__h = (n1); __x.__i.__l = (n0); \ __asm__ ("divu.d %0,%1,%2" \ : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ (r) = (n0) - __q.__l * (d); (q) = __q.__l; }) #define UMUL_TIME 5 #define UDIV_TIME 25 #else #define UMUL_TIME 17 #define UDIV_TIME 150 #endif /* __m88110__ */ #endif /* __m88000__ */ #if defined (__mips) && W_TYPE_SIZE == 32 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 7 #define umul_ppmm(w1, w0, u, v) \ __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) #else #define umul_ppmm(w1, w0, u, v) \ __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \ : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) #endif #define UMUL_TIME 10 #define UDIV_TIME 100 #endif /* __mips */ /* copied from GMP-5.0.4 longlong.h */ #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 #if __GMP_GNUC_PREREQ (4,4) #define umul_ppmm(w1, w0, u, v) \ do { \ typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ w1 = __ll >> 64; \ w0 = __ll; \ } while (0) #endif #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) #endif #if !defined (umul_ppmm) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \ : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) #endif #define UMUL_TIME 20 #define UDIV_TIME 140 #endif /* __mips */ #if defined (__ns32000__) && W_TYPE_SIZE == 32 #define umul_ppmm(w1, w0, u, v) \ ({union {UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __x; \ __asm__ ("meid %2,%0" \ : "=g" (__x.__ll) \ : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) #define __umulsidi3(u, v) \ ({UDItype __w; \ __asm__ ("meid %2,%0" \ : "=g" (__w) \ : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ __w; }) #define udiv_qrnnd(q, r, n1, n0, d) \ ({union 
{UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __x; \ __x.__i.__h = (n1); __x.__i.__l = (n0); \ __asm__ ("deid %2,%0" \ : "=g" (__x.__ll) \ : "0" (__x.__ll), "g" ((USItype)(d))); \ (r) = __x.__i.__l; (q) = __x.__i.__h; }) #define count_trailing_zeros(count,x) \ do { \ __asm__ ("ffsd %2,%0" \ : "=r" ((USItype) (count)) \ : "0" ((USItype) 0), "r" ((USItype) (x))); \ } while (0) #endif /* __ns32000__ */ /* FIXME: We should test _IBMR2 here when we add assembly support for the system vendor compilers. */ #if (defined (_ARCH_PPC) /* AIX */ \ || defined (_ARCH_PWR) /* AIX */ \ || defined (__powerpc__) /* gcc */ \ || defined (__POWERPC__) /* BEOS */ \ || defined (__ppc__) /* Darwin */ \ || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */ \ || (defined (PPC) && defined (CPU_FAMILY) /* VxWorks */ \ && CPU_FAMILY == PPC) \ ) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else \ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ } while (0) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (ah) && (ah) == 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ else if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ else if 
(__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ else \ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ } while (0) #define count_leading_zeros(count, x) \ __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x)) #define COUNT_LEADING_ZEROS_0 32 #if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \ || defined (__ppc__) \ || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */ \ || (defined (PPC) && defined (CPU_FAMILY) /* VxWorks */ \ && CPU_FAMILY == PPC) #define umul_ppmm(ph, pl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 15 #define smul_ppmm(ph, pl, m0, m1) \ do { \ SItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) #define SMUL_TIME 14 #define UDIV_TIME 120 #else #define UMUL_TIME 8 #define smul_ppmm(xh, xl, m0, m1) \ __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1)) #define SMUL_TIME 4 #define sdiv_qrnnd(q, r, nh, nl, d) \ __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d)) #define UDIV_TIME 100 #endif #endif /* 32-bit POWER architecture variants. */ /* We should test _IBMR2 here when we add assembly support for the system vendor compilers. 
*/ #if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 64 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else \ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ } while (0) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (ah) && (ah) == 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ else if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ else \ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ : "=r" (sh), "=&r" (sl) \ : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ } while (0) #define count_leading_zeros(count, x) \ __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) #define COUNT_LEADING_ZEROS_0 64 #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 15 #define smul_ppmm(ph, pl, m0, m1) \ do { \ DItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = 
__m0 * __m1; \ } while (0) #define SMUL_TIME 14 /* ??? */ #define UDIV_TIME 120 /* ??? */ #endif /* 64-bit PowerPC. */ #if defined (__pyr__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addw %5,%1\n\taddwc %3,%0" \ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "%1" ((USItype)(al)), "g" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("subw %5,%1\n\tsubwb %3,%0" \ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "1" ((USItype)(al)), "g" ((USItype)(bl))) /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */ #define umul_ppmm(w1, w0, u, v) \ ({union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ __asm__ ("movw %1,%R0\n\tuemul %2,%0" \ : "=&r" (__x.__ll) \ : "g" ((USItype) (u)), "g" ((USItype)(v))); \ (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) #endif /* __pyr__ */ #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("a %1,%5\n\tae %0,%3" \ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ "%1" ((USItype)(al)), "r" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("s %1,%5\n\tse %0,%3" \ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ "1" ((USItype)(al)), "r" ((USItype)(bl))) #define smul_ppmm(ph, pl, m0, m1) \ __asm__ ( \ "s r2,r2\n" \ " mts r10,%2\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " m r2,%3\n" \ " cas %0,r2,r0\n" \ " mfs r10,%1" \ : "=r" ((USItype)(ph)), "=r" ((USItype)(pl)) \ : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ : "r2") #define UMUL_TIME 20 #define UDIV_TIME 200 #define count_leading_zeros(count, x) \ do { \ if ((x) >= 
0x10000) \ __asm__ ("clz %0,%1" \ : "=r" ((USItype)(count)) : "r" ((USItype)(x) >> 16)); \ else \ { \ __asm__ ("clz %0,%1" \ : "=r" ((USItype)(count)) : "r" ((USItype)(x))); \ (count) += 16; \ } \ } while (0) #endif /* RT/ROMP */ #if defined (__sh2__) && W_TYPE_SIZE == 32 #define umul_ppmm(w1, w0, u, v) \ __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \ : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach") #define UMUL_TIME 5 #endif #if defined (__sparc__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \ __CLOBBER_CC) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \ __CLOBBER_CC) /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h doesn't define anything to indicate that to us, it only sets __sparcv8. */ #if defined (__sparc_v9__) || defined (__sparcv9) /* Perhaps we should use floating-point operations here? */ #if 0 /* Triggers a bug making mpz/tests/t-gcd.c fail. Perhaps we simply need explicitly zero-extend the inputs? */ #define umul_ppmm(w1, w0, u, v) \ __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \ "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1") #else /* Use v8 umul until above bug is fixed. */ #define umul_ppmm(w1, w0, u, v) \ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #endif /* Use a plain v8 divide for v9. 
*/ #define udiv_qrnnd(q, r, n1, n0, d) \ do { \ USItype __q; \ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ (r) = (n0) - __q * (d); \ (q) = __q; \ } while (0) #else #if defined (__sparc_v8__) /* gcc normal */ \ || defined (__sparcv8) /* gcc solaris */ /* Don't match immediate range because, 1) it is not often useful, 2) the 'I' flag thinks of the range as a 13 bit signed interval, while we want to match a 13 bit interval, sign extended to 32 bits, but INTERPRETED AS UNSIGNED. */ #define umul_ppmm(w1, w0, u, v) \ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #define UMUL_TIME 5 #if defined(HAVE_HOST_CPU_supersparc) && HAVE_HOST_CPU_supersparc #define UDIV_TIME 60 /* SuperSPARC timing */ #else /* Don't use this on SuperSPARC because its udiv only handles 53 bit dividends and will trap to the kernel for the rest. */ #define udiv_qrnnd(q, r, n1, n0, d) \ do { \ USItype __q; \ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ (r) = (n0) - __q * (d); \ (q) = __q; \ } while (0) #define UDIV_TIME 25 #endif /* HAVE_HOST_CPU_supersparc */ #else /* ! __sparc_v8__ */ #if defined (__sparclite__) /* This has hardware multiply but not divide. It also has two additional instructions scan (ffs from high bit) and divscc. */ #define umul_ppmm(w1, w0, u, v) \ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #define UMUL_TIME 5 #define udiv_qrnnd(q, r, n1, n0, d) \ __asm__ ("! Inlined udiv_qrnnd\n" \ " wr %%g0,%2,%%y ! 
Not a delayed write for sparclite\n" \ " tst %%g0\n" \ " divscc %3,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%%g1\n" \ " divscc %%g1,%4,%0\n" \ " rd %%y,%1\n" \ " bl,a 1f\n" \ " add %1,%4,%1\n" \ "1: ! End of inline udiv_qrnnd" \ : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ : "%g1" __AND_CLOBBER_CC) #define UDIV_TIME 37 #define count_leading_zeros(count, x) \ __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) /* Early sparclites return 63 for an argument of 0, but they warn that future implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 undefined. */ #endif /* __sparclite__ */ #endif /* __sparc_v8__ */ #endif /* __sparc_v9__ */ /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ #ifndef umul_ppmm #define umul_ppmm(w1, w0, u, v) \ __asm__ ("! Inlined umul_ppmm\n" \ " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ " sra %3,31,%%g2 ! Don't move this insn\n" \ " and %2,%%g2,%%g2 ! Don't move this insn\n" \ " andcc %%g0,0,%%g1 ! 
Don't move this insn\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,%3,%%g1\n" \ " mulscc %%g1,0,%%g1\n" \ " add %%g1,%%g2,%0\n" \ " rd %%y,%1" \ : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ : "%g1", "%g2" __AND_CLOBBER_CC) #define UMUL_TIME 39 /* 39 instructions */ #endif #ifndef udiv_qrnnd #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #ifndef UDIV_TIME #define UDIV_TIME 140 #endif #endif /* LONGLONG_STANDALONE */ #endif /* udiv_qrnnd */ #endif /* __sparc__ */ #if defined (__sparc__) && W_TYPE_SIZE == 64 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ( \ "addcc %r4,%5,%1\n" \ " addccc %r6,%7,%%g0\n" \ " addc %r2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \ __CLOBBER_CC) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ( \ "subcc %r4,%5,%1\n" \ " subccc %r6,%7,%%g0\n" \ " subc %r2,%3,%0" \ : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \ "rJ" ((al) >> 32), "rI" ((bl) >> 
32) \ __CLOBBER_CC) #endif #if defined (__vax__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "%1" ((USItype)(al)), "g" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ "1" ((USItype)(al)), "g" ((USItype)(bl))) #define smul_ppmm(xh, xl, m0, m1) \ do { \ union {UDItype __ll; \ struct {USItype __l, __h;} __i; \ } __x; \ USItype __m0 = (m0), __m1 = (m1); \ __asm__ ("emul %1,%2,$0,%0" \ : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ } while (0) #define sdiv_qrnnd(q, r, n1, n0, d) \ do { \ union {DItype __ll; \ struct {SItype __l, __h;} __i; \ } __x; \ __x.__i.__h = n1; __x.__i.__l = n0; \ __asm__ ("ediv %3,%2,%0,%1" \ : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ } while (0) #if 0 /* FIXME: This instruction appears to be unimplemented on some systems (vax 8800 maybe). 
*/ #define count_trailing_zeros(count,x) \ do { \ __asm__ ("ffs 0, 31, %1, %0" \ : "=g" ((USItype) (count)) \ : "g" ((USItype) (x))); \ } while (0) #endif #endif /* __vax__ */ #if defined (__z8000__) && W_TYPE_SIZE == 16 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ : "=r" ((unsigned int)(sh)), "=&r" ((unsigned int)(sl)) \ : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ : "=r" ((unsigned int)(sh)), "=&r" ((unsigned int)(sl)) \ : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) #define umul_ppmm(xh, xl, m0, m1) \ do { \ union {long int __ll; \ struct {unsigned int __h, __l;} __i; \ } __x; \ unsigned int __m0 = (m0), __m1 = (m1); \ __asm__ ("mult %S0,%H3" \ : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ : "%1" (m0), "rQR" (m1)); \ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ (xh) += ((((signed int) __m0 >> 15) & __m1) \ + (((signed int) __m1 >> 15) & __m0)); \ } while (0) #endif /* __z8000__ */ #endif /* __GNUC__ */ #endif /* NO_ASM */ #ifdef _MSC_VER # include # if defined( _WIN64 ) # define count_leading_zeros(c,x) \ do { \ ASSERT ((x) != 0); \ _BitScanReverse64(&c, (x)); \ c = 63 - c; \ } while (0) # define count_trailing_zeros(c,x) \ do { \ ASSERT ((x) != 0); \ _BitScanForward64(&c, (x)); \ } while (0) # define umul_ppmm(xh, xl, m0, m1) \ do { \ xl = _umul128( (m0), (m1), &xh); \ } while (0) # else # define count_leading_zeros(c,x) \ do { \ ASSERT ((x) != 0); \ _BitScanReverse(&c, (x)); \ c = 31 - c; \ } while (0) # define count_trailing_zeros(c,x) \ do { \ ASSERT ((x) != 0); \ _BitScanForward(&c, (x)); \ } while (0) # define umul_ppmm(xh, xl, m0, m1) \ do { unsigned __int64 _t; \ _t = __emulu( (m0), (m1)); \ xl = _t & 0xffffffff; \ xh = _t >> 32; \ } while (0) # endif #endif #if !defined (umul_ppmm) && defined (__umulsidi3) 
#define umul_ppmm(ph, pl, m0, m1) \ { \ UDWtype __ll = __umulsidi3 (m0, m1); \ ph = (UWtype) (__ll >> W_TYPE_SIZE); \ pl = (UWtype) __ll; \ } #endif #if !defined (__umulsidi3) #define __umulsidi3(u, v) \ ({UWtype __hi, __lo; \ umul_ppmm (__hi, __lo, u, v); \ ((UDWtype) __hi << W_TYPE_SIZE) | __lo; }) #endif /* Note the prototypes are under !define(umul_ppmm) etc too, since the HPPA versions above are different and we don't want to conflict. */ #if ! defined (umul_ppmm) && \ defined(HAVE_NATIVE_mpn_umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm #define mpn_umul_ppmm __MPN(umul_ppmm) extern mp_limb_t mpn_umul_ppmm _PROTO ((mp_limb_t *, mp_limb_t, mp_limb_t)); #define umul_ppmm(wh, wl, u, v) \ do { \ mp_limb_t __umul_ppmm__p0; \ (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \ (mp_limb_t) (u), (mp_limb_t) (v)); \ (wl) = __umul_ppmm__p0; \ } while (0) #endif #if ! defined (udiv_qrnnd) && \ defined(HAVE_NATIVE_mpn_udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd #define mpn_udiv_qrnnd __MPN(udiv_qrnnd) extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t)); #define udiv_qrnnd(q, r, n1, n0, d) \ do { \ mp_limb_t __udiv_qrnnd__r; \ (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \ (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \ (r) = __udiv_qrnnd__r; \ } while (0) #endif /* If this machine has no inline assembler, use C macros. */ #if !defined (add_ssaaaa) #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ UWtype __x; \ __x = (al) + (bl); \ (sh) = (ah) + (bh) + (__x < (al)); \ (sl) = __x; \ } while (0) #endif #if !defined (sub_ddmmss) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ UWtype __x; \ __x = (al) - (bl); \ (sh) = (ah) - (bh) - (__x > (al)); \ (sl) = __x; \ } while (0) #endif /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of smul_ppmm. 
*/ #if !defined (umul_ppmm) && defined (smul_ppmm) #define umul_ppmm(w1, w0, u, v) \ do { \ UWtype __w1; \ UWtype __xm0 = (u), __xm1 = (v); \ smul_ppmm (__w1, w0, __xm0, __xm1); \ (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \ + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \ } while (0) #endif /* If we still don't have umul_ppmm, define it using plain C. */ #if !defined (umul_ppmm) #define umul_ppmm(w1, w0, u, v) \ do { \ UWtype __x0, __x1, __x2, __x3; \ UHWtype __ul, __vl, __uh, __vh; \ UWtype __u = (u), __v = (v); \ \ __ul = __ll_lowpart (__u); \ __uh = __ll_highpart (__u); \ __vl = __ll_lowpart (__v); \ __vh = __ll_highpart (__v); \ \ __x0 = (UWtype) __ul * __vl; \ __x1 = (UWtype) __ul * __vh; \ __x2 = (UWtype) __uh * __vl; \ __x3 = (UWtype) __uh * __vh; \ \ __x1 += __ll_highpart (__x0);/* this can't give carry */ \ __x1 += __x2; /* but this indeed can */ \ if (__x1 < __x2) /* did we get it? */ \ __x3 += __ll_B; /* yes, add it in the proper pos. */ \ \ (w1) = __x3 + __ll_highpart (__x1); \ (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \ } while (0) #endif /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will exist in one form or another. */ #if !defined (smul_ppmm) #define smul_ppmm(w1, w0, u, v) \ do { \ UWtype __w1; \ UWtype __xm0 = (u), __xm1 = (v); \ umul_ppmm (__w1, w0, __xm0, __xm1); \ (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \ - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \ } while (0) #endif /* Define this unconditionally, so it can be used for debugging. */ #define __udiv_qrnnd_c(q, r, n1, n0, d) \ do { \ UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ \ ASSERT ((d) != 0); \ ASSERT ((n1) < (d)); \ \ __d1 = __ll_highpart (d); \ __d0 = __ll_lowpart (d); \ \ __q1 = (n1) / __d1; \ __r1 = (n1) - __q1 * __d1; \ __m = (UWtype) __q1 * __d0; \ __r1 = __r1 * __ll_B | __ll_highpart (n0); \ if (__r1 < __m) \ { \ __q1--, __r1 += (d); \ if (__r1 >= (d)) /* i.e. 
we didn't get carry when adding to __r1 */\ if (__r1 < __m) \ __q1--, __r1 += (d); \ } \ __r1 -= __m; \ \ __q0 = __r1 / __d1; \ __r0 = __r1 - __q0 * __d1; \ __m = (UWtype) __q0 * __d0; \ __r0 = __r0 * __ll_B | __ll_lowpart (n0); \ if (__r0 < __m) \ { \ __q0--, __r0 += (d); \ if (__r0 >= (d)) \ if (__r0 < __m) \ __q0--, __r0 += (d); \ } \ __r0 -= __m; \ \ (q) = (UWtype) __q1 * __ll_B | __q0; \ (r) = __r0; \ } while (0) /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through __udiv_w_sdiv (defined in libgcc or elsewhere). */ #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) #define udiv_qrnnd(q, r, nh, nl, d) \ do { \ UWtype __r; \ (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \ (r) = __r; \ } while (0) #endif /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */ #if !defined (udiv_qrnnd) #define UDIV_NEEDS_NORMALIZATION 1 #define udiv_qrnnd __udiv_qrnnd_c #endif #if !defined (count_leading_zeros) #define count_leading_zeros(count, x) \ do { \ UWtype __xr = (x); \ UWtype __a; \ \ if (W_TYPE_SIZE == 32) \ { \ __a = __xr < ((UWtype) 1 << 2*__BITS4) \ ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \ : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \ : 3*__BITS4 + 1); \ } \ else \ { \ for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ if (((__xr >> __a) & 0xff) != 0) \ break; \ ++__a; \ } \ \ (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ } while (0) /* This version gives a well-defined value for zero. */ #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #endif #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB extern const unsigned char __GMP_DECLSPEC __clz_tab[128]; #endif #if !defined (count_trailing_zeros) /* Define count_trailing_zeros using count_leading_zeros. The latter might be defined in asm, but if it is not, the C version above is good enough. 
*/ #define count_trailing_zeros(count, x) \ do { \ UWtype __ctz_x = (x); \ UWtype __ctz_c; \ ASSERT (__ctz_x != 0); \ count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ (count) = W_TYPE_SIZE - 1 - __ctz_c; \ } while (0) #endif #ifndef UDIV_NEEDS_NORMALIZATION #define UDIV_NEEDS_NORMALIZATION 0 #endif /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and that hence the latter should always be used. */ #ifndef UDIV_PREINV_ALWAYS #define UDIV_PREINV_ALWAYS 0 #endif /* Give defaults for UMUL_TIME and UDIV_TIME. */ #ifndef UMUL_TIME #define UMUL_TIME 1 #endif #ifndef UDIV_TIME #define UDIV_TIME UMUL_TIME #endif ecm-6.4.4/ecm2.c0000644023561000001540000010432312106741273010213 00000000000000/* Elliptic Curve Method implementation: stage 2 routines. Copyright 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Paul Zimmermann, Alexander Kruppa, Pierrick Gaudry, Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "ecm-impl.h" /* R_i <- q_i * S, 0 <= i < n, where q_i are large integers, S is a point on an elliptic curve. Uses max(bits in q_i) modular inversions (one less if max(q_i) is a power of 2). Needs up to n+2 cells in T. 
Returns whether factor was found or not found, factor goes into p. No error can occur. */ static int multiplyW2n (mpz_t p, point *R, curve *S, mpz_t *q, const unsigned int n, mpmod_t modulus, mpres_t u, mpres_t v, mpres_t *T, unsigned long *tot_muls, unsigned long *tot_gcds) { unsigned int i, maxbit, k; /* k is the number of values to batch invert */ unsigned int l, t, muls = 0, gcds = 0; #ifdef WANT_EXPCOST unsigned int hamweight = 0; #endif int youpi = ECM_NO_FACTOR_FOUND; mpz_t flag; /* Used as bit field, keeps track of which R[i] contain partial results */ point s; /* 2^t * S */ mpz_t signs; /* Used as bit field, i-th bit is set iff q[i]<0 */ #ifdef WANT_ASSERT mpz_t __dummy; /* used for local computations */ #endif if (n == 0) return ECM_NO_FACTOR_FOUND; /* Is S the neutral element ? */ if (mpres_is_zero (S->x, modulus) && mpres_is_zero (S->y, modulus)) { for (i = 0; i < n; i++) { mpres_set (R[i].x, S->x, modulus); mpres_set (R[i].y, S->y, modulus); } return ECM_NO_FACTOR_FOUND; } MPZ_INIT2 (flag, n); MPZ_INIT2 (signs, n); mpres_init (s.x, modulus); mpres_init (s.y, modulus); mpres_set (s.x, S->x, modulus); mpres_set (s.y, S->y, modulus); /* Set maxbit to index of highest set bit among all the q[i] */ /* Index of highest bit of q is sizeinbase(q, 2) - 1 */ maxbit = 0; for (i = 0; i < n; i++) { /* We'll first compute positive multiples and change signs later */ if (mpz_sgn (q[i]) < 0) { mpz_setbit (signs, i);; mpz_neg (q[i], q[i]); } /* Multiplier == 0? 
Then set result to neutral element */ if (mpz_sgn (q[i]) == 0) { mpres_set_ui (R[i].x, 0, modulus); mpres_set_ui (R[i].y, 0, modulus); } #ifdef WANT_EXPCOST else hamweight += mpz_popcount (q[i]) - 1; #endif if ((t = mpz_sizeinbase (q[i], 2) - 1) > maxbit) maxbit = t; } #ifdef WANT_EXPCOST outputf (OUTPUT_ALWAYS, "Expecting %d multiplications and %d extgcds\n", 4 * (maxbit) + 6 * hamweight - 3, maxbit + 1); /* maxbit is floor(log_2(max(q_i))) */ #endif for (t = 0; t <= maxbit && !youpi; t++) /* Examine t-th bit of the q[i] */ { /* See which values need inverting and put them into T[]. Keep number of those values in k */ k = 0; /* Will we have to double s at the end of this pass? If yes, schedule 2*s.y for inverting */ if (t < maxbit) mpres_add (T[k++], s.y, s.y, modulus); for (i = 0; i < n && !youpi; i++) if (mpz_tstbit (q[i], t)) /* If q[i] & (1< 0) mpres_mul (T[k], T[k], T[k - 1], modulus); k++; } /* If No: we'll simply set R[i] to s later on, nothing tbd here */ /* So there are k values in need of inverting, call them v[m], 0 <= m < k. 
*/ /* Here T[m], 0 <= m < k, contains v[0]*...*v[m] */ /* Put inverse of the product of all scheduled values in T[k]*/ if (k > 0) { muls += 3 * (k - 1); gcds++; if (!mpres_invert (T[k], T[k - 1], modulus)) { /* If a factor was found, put factor in p, flag success and bail out of loop */ if (p != NULL) mpres_gcd (p, T[k - 1], modulus); youpi = ECM_FACTOR_FOUND_STEP2; break; } } /* T[k] now contains 1/(v[0]*...*v[k - 1]), T[m], 0 <= m < k, still contain v[0]*...*v[m] */ l = k - 1; for (i = n; i-- > 0; ) /* Go through the R[i] again, backwards */ if (mpz_tstbit (q[i], t)) { if (mpz_tstbit (flag, i)) { /* T[k] contains 1/(v[0]*...*v[l]) */ if (l > 0) /* need to separate the values */ { /* T[l - 1] has v[0]*...*v[l-1] */ mpres_mul (T[l], T[l - 1], T[k], modulus); /* So T[l] now has 1/v[l] == 1/(s.x - R[i].x) */ mpres_sub (u, s.x, R[i].x, modulus); mpres_mul (T[k], T[k], u, modulus); /* T[k] now has 1/(v[0]*...*v[l - 1]) */ } else { /* T[k] contains 1/v[0] */ mpres_set (T[0], T[k], modulus); } /* 1/(s.x - R[i].x) is in T[l] */ #ifdef WANT_ASSERT mpres_sub (u, s.x, R[i].x, modulus); mpres_mul (u, u, T[l], modulus); mpz_init(__dummy); mpres_get_z (__dummy, u, modulus); mpz_mod (__dummy, __dummy, modulus->orig_modulus); if (mpz_cmp_ui (__dummy, 1) != 0) outputf (OUTPUT_ERROR, "Error, (s.x - R[%d].x) * T[%d] == " "%Zd\n", i, l, __dummy); mpz_clear(__dummy); #endif mpres_sub (u, s.y, R[i].y, modulus); /* U = y2 - y1 */ mpres_mul (T[l], T[l], u, modulus); /* T[l] = (y2-y1)/(x2-x1) = lambda */ mpres_sqr (u, T[l], modulus); /* U = lambda^2 */ mpres_sub (u, u, R[i].x, modulus); /* U = lambda^2 - x1 */ mpres_sub (R[i].x, u, s.x, modulus); /* x3 = lambda^2 - x1 - x2 */ mpres_sub (u, s.x, R[i].x, modulus); /* U = x2 - x3 */ mpres_mul (u, u, T[l], modulus); /* U = lambda*(x2 - x3) */ mpres_sub (R[i].y, u, s.y, modulus); /* y3 = lambda*(x2 - x3) - y2 */ muls += 3; l--; } else /* R[i] does not contain a partial result. 
*/ { mpres_set (R[i].x, s.x, modulus); /* Just set R[i] to s */ mpres_set (R[i].y, s.y, modulus); mpz_setbit (flag, i); /* and flag it as used */ } } if (t < maxbit) /* Double s */ { ASSERT(l==0); #ifdef WANT_ASSERT mpres_add (u, s.y, s.y, modulus); mpres_mul (u, u, T[k], modulus); mpz_init(__dummy); mpres_get_z (__dummy, u, modulus); mpz_mod (__dummy, __dummy, modulus->orig_modulus); if (mpz_cmp_ui (__dummy, 1) != 0) outputf (OUTPUT_ERROR, "Error, at t==%d, 2*s.y / (2*s.y) == %Zd\n", t, __dummy); mpz_clear(__dummy); #endif /* 1/(2*s.y) is in T[k] */ mpres_sqr (u, s.x, modulus); /* U = X^2 */ mpres_mul_ui (u, u, 3, modulus); /* U = 3*X^2 */ mpres_add (u, u, S->A, modulus); /* U = 3*X^2 + A */ mpres_mul (T[k], T[k], u, modulus); /* T = (3*X^2 + A) / (2*Y) = lambda */ mpres_sqr (u, T[k], modulus); /* U = lambda^2 */ mpres_sub (u, u, s.x, modulus); /* U = lambda^2 - X */ mpres_sub (u, u, s.x, modulus); /* U = lambda^2 - 2*X = s.x' */ mpres_sub (v, s.x, u, modulus); /* V = s.x - s.x' */ mpres_mul (v, v, T[k], modulus); /* V = lambda*(s.x - s.x') */ mpres_sub (s.y, v, s.y, modulus); /* s.y' = lambda*(s.x - s.x') - s.y */ mpres_set (s.x, u, modulus); muls += 4; } } mpres_clear (s.y, modulus); mpres_clear (s.x, modulus); mpz_clear (flag); if (tot_muls != NULL) *tot_muls += muls; if (tot_gcds != NULL) *tot_gcds += gcds; /* Now take inverse points (negative y-coordinate) where q[i] was < 0 */ for (i = 0; i < n; i++) if (mpz_tstbit (signs, i)) { mpz_neg (R[i].y, R[i].y); mpz_neg (q[i], q[i]); } mpz_clear (signs); return youpi; } /* Input: Points X[0]..X[(n+1)*m-1] T is used for temporary values and needs to have (n-1)*m+1 entries. Performs the following loop with only one gcdext, using Montgomery's trick: for (i=0;i 0. Processes neutral (zero), identical and negative points correctly. Return factor found or not (no error can occur here). 
*/ static int addWnm (mpz_t p, point *X, curve *S, mpmod_t modulus, unsigned int m, unsigned int n, mpres_t *T, unsigned long *tot_muls, unsigned long *tot_gcds) { unsigned int k, l; int i, j; if (n == 0 || m == 0) return ECM_NO_FACTOR_FOUND; k = 0; for (i = m - 1; i >= 0; i--) /* Go through the m different lists */ for (j = n - 1; j >= 0; j--) /* Go through each list backwards */ { /* And prepare the values to be inverted */ point *X1, *X2; X1 = X + i * (n + 1) + j; X2 = X + i * (n + 1) + j + 1; /* If either element is the neutral element, nothing tbd here */ if ((mpres_is_zero (X1->x, modulus) && mpres_is_zero (X1->y, modulus)) || (mpres_is_zero (X2->x, modulus) && mpres_is_zero (X2->y, modulus))) continue; mpres_sub (T[k], X2->x, X1->x, modulus); /* Schedule X2.x - X1.x */ if (mpres_is_zero (T[k], modulus)) /* If both x-cordinates are identical */ { /* Are the points identical? Compare y coordinates: */ mpres_sub (T[k], X2->y, X1->y, modulus); if (mpres_is_zero (T[k], modulus)) { /* Yes, we need to double. Schedule 2*X[...].y */ mpres_add (T[k], X1->y, X1->y, modulus); } else /* No, they are inverses. Nothing tbd here */ { #ifdef WANT_ASSERT /* Check that the y coordinates are mutual negatives */ mpres_add (T[k], X2->y, X1->y, modulus); ASSERT (mpres_is_zero (T[k], modulus)); #endif continue; } } if (k > 0) mpres_mul (T[k], T[k], T[k - 1], modulus); k++; } /* v_m = X[i * (n + 1) + j] - X[i * (n + 1) + j + 1], 0 <= j < n, and m = i * n + j */ /* Here T[m] = v_0 * ... * v_m, 0 <= m < k */ if (k > 0 && !mpres_invert (T[k], T[k - 1], modulus)) { if (p != NULL) mpres_gcd (p, T[k - 1], modulus); if (tot_muls != NULL) (*tot_muls) += m * n - 1; if (tot_gcds != NULL) (*tot_gcds) ++; return ECM_FACTOR_FOUND_STEP2; } /* T[k] = 1/(v_0 * ... * v_m), 0 <= m < k */ l = k - 1; for (i = 0; (unsigned) i < m; i++) for (j = 0; (unsigned) j < n; j++) { point *X1, *X2; X1 = X + i * (n + 1) + j; X2 = X + i * (n + 1) + j + 1; /* Is X1 the neutral element? 
*/ if (mpres_is_zero (X1->x, modulus) && mpres_is_zero (X1->y, modulus)) { /* Yes, set X1 to X2 */ mpres_set (X1->x, X2->x, modulus); mpres_set (X1->y, X2->y, modulus); continue; } /* Is X2 the neutral element? If so, X1 stays the same */ if (mpres_is_zero (X2->x, modulus) && mpres_is_zero (X2->y, modulus)) continue; /* Are the x-coordinates identical? */ mpres_sub (T[k + 1], X2->x, X1->x, modulus); if (mpres_is_zero (T[k + 1], modulus)) { /* Are the points inverses of each other? */ mpres_sub (T[k + 1], X2->y, X1->y, modulus); if (!mpres_is_zero (T[k + 1], modulus)) { /* Yes. Set X1 to neutral element */ mpres_set_ui (X1->x, 0, modulus); mpres_set_ui (X1->y, 0, modulus); continue; } /* No, we need to double. Restore T[k+1] */ mpres_sub (T[k + 1], X2->x, X1->x, modulus); } if (l == 0) mpz_set (T[0], T[k]); else mpres_mul (T[l], T[k], T[l - 1], modulus); /* T_l = 1/(v_0 * ... * v_l) * (v_0 * ... * v_{l-1}) = 1/v_l */ if (mpres_is_zero (T[k + 1], modulus)) /* Identical points, so double X1 */ { if (l > 0) { mpres_add (T[k + 1], X1->y, X1->y, modulus); /* T[k+1] = v_{l} */ mpres_mul (T[k], T[k], T[k + 1], modulus); /* T_k = 1/(v_0 * ... * v_l) * v_l = 1/(v_0 * ... * v_{l-1}) */ } mpres_sqr (T[k + 1], X1->x, modulus); mpres_mul_ui (T[k + 1], T[k + 1], 3, modulus); mpres_add (T[k + 1], T[k + 1], S->A, modulus); mpres_mul (T[l], T[k + 1], T[l], modulus); /* T[l] = lambda */ mpres_sqr (T[k + 1], T[l], modulus); /* T1 = lambda^2 */ mpres_sub (T[k + 1], T[k + 1], X1->x, modulus); /* T1 = lambda^2 - x1 */ mpres_sub (X1->x, T[k + 1], X2->x, modulus); /* X1.x = lambda^2 - x1 - x2 = x3 */ mpres_sub (T[k + 1], X2->x, X1->x, modulus); /* T1 = x2 - x3 */ mpres_mul (T[k + 1], T[k + 1], T[l], modulus); /* T1 = lambda*(x2 - x3) */ mpres_sub (X1->y, T[k + 1], X2->y, modulus); /* Y1 = lambda*(x2 - x3) - y2 = y3 */ } else { if (l > 0) { mpres_mul (T[k], T[k], T[k + 1], modulus); /* T_k = 1/(v_0 * ... * v_l) * v_l = 1/(v_0 * ... 
* v_{l-1}) */ } mpres_sub (T[k + 1], X2->y, X1->y, modulus); /* T1 = y2 - y1 */ mpres_mul (T[l], T[l], T[k + 1], modulus); /* Tl = (y2 - y1) / (x2 - x1) = lambda */ mpres_sqr (T[k + 1], T[l], modulus); /* T1 = lambda^2 */ mpres_sub (T[k + 1], T[k + 1], X1->x, modulus); /* T1 = lambda^2 - x1 */ mpres_sub (X1->x, T[k + 1], X2->x, modulus); /* X1.x = lambda^2 - x1 - x2 = x3 */ mpres_sub (T[k + 1], X2->x, X1->x, modulus); /* T1 = x2 - x3 */ mpres_mul (T[k + 1], T[k + 1], T[l], modulus); /* T1 = lambda*(x2 - x3) */ mpres_sub (X1->y, T[k + 1], X2->y, modulus); /* Y1 = lambda*(x2 - x3) - y2 = y3 */ } l--; } if (tot_muls != NULL) (*tot_muls) += 6 * m * n - 3; if (tot_gcds != NULL) (*tot_gcds) ++; return ECM_NO_FACTOR_FOUND; } /* puts in F[0..dF-1] the successive values of Dickson_{S, a} (j * d2) * s where s is a point on the elliptic curve for j == 1 mod 6, j and d1 coprime. Returns non-zero iff a factor was found (then stored in f) or an error occurred. */ int ecm_rootsF (mpz_t f, listz_t F, root_params_t *root_params, unsigned long dF, curve *s, mpmod_t modulus) { unsigned long i; unsigned long muls = 0, gcds = 0; long st; int youpi = ECM_NO_FACTOR_FOUND; listz_t coeffs; ecm_roots_state_t state; progression_params_t *params = &state.params; /* for less typing */ mpz_t t; if (dF == 0) return ECM_NO_FACTOR_FOUND; st = cputime (); /* Relative cost of point add during init and computing roots assumed =1 */ init_roots_params (params, root_params->S, root_params->d1, root_params->d2, 1.0); outputf (OUTPUT_DEVVERBOSE, "ecm_rootsF: state: nr = %d, dsieve = %d, " "size_fd = %d, S = %d, dickson_a = %d\n", params->nr, params->dsieve, params->size_fd, params->S, params->dickson_a); /* Init finite differences tables */ MPZ_INIT (t); /* t = 0 */ coeffs = init_progression_coeffs (t, params->dsieve, root_params->d2, 1, 6, params->S, params->dickson_a); mpz_clear (t); if (coeffs == NULL) /* error */ { youpi = ECM_ERROR; goto clear; } /* The highest coefficient is the same for all 
progressions, so set them to one for all but the first progression, later we copy the point. FIXME: can we avoid the multiplication of those points in multiplyW2n() below? */ for (i = params->S + 1; i < params->size_fd; i += params->S + 1) mpz_set_ui (coeffs[i + params->S], 1); /* Allocate memory for fd[] and T[] */ state.fd = (point *) malloc (params->size_fd * sizeof (point)); if (state.fd == NULL) { youpi = ECM_ERROR; goto exit_ecm_rootsF; } for (i = 0; i < params->size_fd; i++) { outputf (OUTPUT_TRACE, "ecm_rootsF: coeffs[%d] = %Zd\n", i, coeffs[i]); MEMORY_TAG; mpres_init (state.fd[i].x, modulus); MEMORY_TAG; mpres_init (state.fd[i].y, modulus); MEMORY_UNTAG; } state.T = (mpres_t *) malloc ((params->size_fd + 4) * sizeof (mpres_t)); if (state.T == NULL) { youpi = ECM_ERROR; goto ecm_rootsF_clearfdi; } for (i = 0 ; i < params->size_fd + 4; i++) { MEMORY_TAG; mpres_init (state.T[i], modulus); MEMORY_UNTAG; } /* Multiply fd[] = s * coeffs[] */ youpi = multiplyW2n (f, state.fd, s, coeffs, params->size_fd, modulus, state.T[0], state.T[1], state.T + 2, &muls, &gcds); if (youpi == ECM_FACTOR_FOUND_STEP2) outputf (OUTPUT_VERBOSE, "Found factor while computing coeff[] * X\n"); if (youpi == ECM_ERROR) goto clear; /* Copy the point corresponding to the highest coefficient of the first progression to the other progressions */ for (i = params->S + 1; i < params->size_fd; i += params->S + 1) { mpres_set (state.fd[i + params->S].x, state.fd[params->S].x, modulus); mpres_set (state.fd[i + params->S].y, state.fd[params->S].y, modulus); } clear_list (coeffs, params->size_fd); coeffs = NULL; if (test_verbose (OUTPUT_VERBOSE)) { unsigned int st1 = cputime (); outputf (OUTPUT_VERBOSE, "Initializing tables of differences for F took %ldms", elltime (st, st1)); outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds", muls, gcds); outputf (OUTPUT_VERBOSE, "\n"); st = st1; muls = 0; gcds = 0; } /* Now for the actual calculation of the roots. 
*/ for (i = 0; i < dF && !youpi;) { /* Is this a rsieve value where we computed Dickson(j * d2) * X? */ if (gcd ((unsigned long) params->rsieve, (unsigned long) params->dsieve) == 1UL) { /* Did we use every progression since the last update? */ if (params->next == params->nr) { /* Yes, time to update again */ youpi = addWnm (f, state.fd, s, modulus, params->nr, params->S, state.T, &muls, &gcds); ASSERT(youpi != ECM_ERROR); /* no error can occur in addWnm */ params->next = 0; if (youpi == ECM_FACTOR_FOUND_STEP2) outputf (OUTPUT_VERBOSE, "Found factor while computing roots of F\n"); } /* Is this a j value where we want Dickson(j * d2) * X as a root? */ if (gcd ((unsigned long) params->rsieve, root_params->d1) == 1UL) mpres_get_z (F[i++], state.fd[params->next * (params->S + 1)].x, modulus); params->next ++; } params->rsieve += 6; } clear: for (i = 0 ; i < params->size_fd + 4; i++) mpres_clear (state.T[i], modulus); free (state.T); ecm_rootsF_clearfdi: for (i = 0; i < params->size_fd; i++) { mpres_clear (state.fd[i].x, modulus); mpres_clear (state.fd[i].y, modulus); } free (state.fd); exit_ecm_rootsF: if (youpi) return youpi; /* error or factor found */ outputf (OUTPUT_VERBOSE, "Computing roots of F took %ldms", elltime (st, cputime ())); outputf (OUTPUT_DEVVERBOSE, ", %ld muls and %ld extgcds", muls, gcds); outputf (OUTPUT_VERBOSE, "\n"); return ECM_NO_FACTOR_FOUND; } /* Perform the necessary initialization to allow computation of Dickson_{S, a}(s+n*d) * P , where P is a point on the elliptic curve for successive n, where Dickson_{S, a} is the degree S Dickson polynomial with parameter a. For a == 0, Dickson_{S, a} (x) = x^S. If a factor is found during the initialisation, NULL is returned and the factor in f. If an error occurred, NULL is returned and f is -1. 
*/

/* On success the returned state owns the table of points (state->fd), the
   scratch residues (state->T) and itself; release it with
   ecm_rootsG_clear().  On every failure path everything allocated so far
   is released here before NULL is returned.  */

ecm_roots_state_t *
ecm_rootsG_init (mpz_t f, curve *X, root_params_t *root_params,
                 unsigned long dF, unsigned long blocks, mpmod_t modulus)
{
  unsigned int k, phid2;
  unsigned long muls = 0, gcds = 0;
  listz_t coeffs;
  ecm_roots_state_t *state;
  progression_params_t *params; /* for less typing */
  int youpi = 0;
  unsigned int T_inv;
  double bestnr;
  long st = 0;

  ASSERT (gcd (root_params->d1, root_params->d2) == 1UL);

  if (test_verbose (OUTPUT_VERBOSE))
    st = cputime ();

  state = (ecm_roots_state_t *) malloc (sizeof (ecm_roots_state_t));
  if (state == NULL)
    {
      mpz_set_si (f, -1);
      return NULL;
    }
  params = &(state->params);

  /* If S < 0, use degree |S| Dickson poly, otherwise use x^S */
  params->dickson_a = (root_params->S < 0) ? -1 : 0;
  params->S = abs (root_params->S);

  /* Estimate the cost of a modular inversion (in unit of time per
     modular multiplication) */
  if (modulus->repr == ECM_MOD_BASE2)
    T_inv = 18;
  else
    T_inv = 6;

  /* Guesstimate a value for the number of disjoint progressions to use */
  bestnr = -(4. + T_inv) + sqrt(12. * (double) dF * (double) blocks *
        (T_inv - 3.) * log (2. * root_params->d1) / log (2.) - (4. + T_inv) *
        (4. + T_inv));
  bestnr /= 6. * (double) (params->S) * log (2. * root_params->d1) / log (2.0);

  outputf (OUTPUT_TRACE, "ecm_rootsG_init: bestnr = %f\n", bestnr);

  /* The argument of sqrt() above is negative for very small dF * blocks,
     which makes bestnr a NaN.  A NaN fails every comparison, so test for
     "not at least 1" rather than "less than 1": that way a NaN selects one
     progression instead of reaching the (undefined) conversion below. */
  if (!(bestnr >= 1.))
    params->nr = 1;
  else
    params->nr = (unsigned int) (bestnr + .5);

  phid2 = eulerphi (root_params->d2);

  /* Round up params->nr to multiple of eulerphi(d2) */
  if (phid2 > 1)
    params->nr = ((params->nr + (phid2 - 1)) / phid2) * phid2;

  params->size_fd = params->nr * (params->S + 1);

  outputf (OUTPUT_DEVVERBOSE, "ecm_rootsG_init: i0=%Zd, d1=%lu, d2=%lu, "
           "dF=%lu, blocks=%lu, S=%u, T_inv = %d, nr=%d\n", root_params->i0,
           root_params->d1, root_params->d2, dF, blocks, params->S, T_inv,
           params->nr);

  state->X = X;
  params->next = 0;
  params->dsieve = 1; /* We only init progressions coprime to d2,
                         so nothing to be skipped */
  params->rsieve = 0;

  coeffs = init_progression_coeffs (root_params->i0, root_params->d2,
             root_params->d1, params->nr / phid2, 1, params->S,
             params->dickson_a);

  if (coeffs == NULL) /* error */
    {
      free (state);
      mpz_set_si (f, -1);
      return NULL;
    }

  state->fd = (point *) malloc (params->size_fd * sizeof (point));
  if (state->fd == NULL)
    {
      clear_list (coeffs, params->size_fd);
      free (state);
      mpz_set_si (f, -1);
      return NULL;
    }
  for (k = 0; k < params->size_fd; k++)
    {
      MEMORY_TAG;
      mpres_init (state->fd[k].x, modulus);
      MEMORY_TAG;
      mpres_init (state->fd[k].y, modulus);
      MEMORY_UNTAG;
    }

  state->size_T = params->size_fd + 4;
  state->T = (mpres_t *) malloc (state->size_T * sizeof (mpres_t));
  if (state->T == NULL)
    {
      for (k = 0; k < params->size_fd; k++)
        {
          mpres_clear (state->fd[k].x, modulus);
          mpres_clear (state->fd[k].y, modulus);
        }
      free (state->fd); /* the array itself, not only its entries */
      clear_list (coeffs, params->size_fd);
      free (state);
      mpz_set_si (f, -1);
      return NULL;
    }
  for (k = 0; k < state->size_T; k++)
    {
      MEMORY_TAG;
      mpres_init (state->T[k], modulus);
      MEMORY_UNTAG;
    }

  /* The highest coefficient is the same in every progression: set it to
     one in all but the first one, the point is copied after the
     multiplication. */
  for (k = params->S + 1; k < params->size_fd; k += params->S + 1)
    mpz_set_ui (coeffs[k + params->S], 1);

  if (test_verbose (OUTPUT_TRACE))
    for (k = 0; k < params->size_fd; k++)
      outputf (OUTPUT_TRACE, "ecm_rootsG_init: coeffs[%d] == %Zd\n",
               k, coeffs[k]);

  youpi = multiplyW2n (f, state->fd, X, coeffs, params->size_fd, modulus,
                       state->T[0], state->T[1], state->T + 2, &muls, &gcds);
  if (youpi == ECM_ERROR)
    mpz_set_si (f, -1); /* fall through */

  for (k = params->S + 1; k < params->size_fd; k += params->S + 1)
    {
      mpres_set (state->fd[k + params->S].x, state->fd[params->S].x, modulus);
      mpres_set (state->fd[k + params->S].y, state->fd[params->S].y, modulus);
    }

  clear_list (coeffs, params->size_fd);
  coeffs = NULL;

  if (youpi != ECM_NO_FACTOR_FOUND) /* factor found or error */
    {
      if (youpi == ECM_FACTOR_FOUND_STEP2)
        outputf (OUTPUT_VERBOSE, "Found factor while computing fd[]\n");

      ecm_rootsG_clear (state, modulus);

      /* Signal that a factor was found, or an error occurred (f=-1) */
      state = NULL;
    }
  else
    {
      if (test_verbose (OUTPUT_VERBOSE))
        {
          st = elltime (st, cputime ());
          outputf (OUTPUT_VERBOSE,
                   "Initializing table of differences for G took %ldms", st);
          outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds",
                   muls, gcds);
          outputf (OUTPUT_VERBOSE, "\n");
        }
    }

  return state;
}

/* Releases everything owned by a state returned by ecm_rootsG_init(): the
   points in state->fd, the residues in state->T, both arrays, and the
   state structure itself. */
void
ecm_rootsG_clear (ecm_roots_state_t *state, ATTRIBUTE_UNUSED mpmod_t modulus)
{
  unsigned int k;

  for (k = 0; k < state->params.size_fd; k++)
    {
      mpres_clear (state->fd[k].x, modulus);
      mpres_clear (state->fd[k].y, modulus);
    }
  free (state->fd);

  for (k = 0; k < state->size_T; k++)
    mpres_clear (state->T[k], modulus);
  free (state->T);

  free (state);
}

/* Puts in G the successive values of

     Dickson_{S, a}(s+j*k) P

     where P is a point on the elliptic curve,
     0<= j <= dF-1, k is the 'd' value from ecm_rootsG_init()
     and s is the 's' value of ecm_rootsG_init() or where a previous
     call to ecm_rootsG has left off.

   Returns non-zero iff a factor was found (then stored in f).
   Cannot return an error.
*/ int ecm_rootsG (mpz_t f, listz_t G, unsigned long dF, ecm_roots_state_t *state, mpmod_t modulus) { unsigned long i; unsigned long muls = 0, gcds = 0; int youpi = ECM_NO_FACTOR_FOUND; long st; point *fd = state->fd; /* to save typing */ progression_params_t *params = &(state->params); /* for less typing */ st = cputime (); outputf (OUTPUT_TRACE, "ecm_rootsG: dF = %lu, state: nr = %u, next = %u, " "S = %u, dsieve = %u, rsieve = %u,\n\tdickson_a = %d\n", dF, params->nr, params->next, params->S, params->dsieve, params->rsieve, params->dickson_a); for (i = 0; i < dF;) { /* Did we use every progression since the last update? */ if (params->next == params->nr) { /* Yes, time to update again */ youpi = addWnm (f, fd, state->X, modulus, params->nr, params->S, state->T, &muls, &gcds); ASSERT(youpi != ECM_ERROR); /* no error can occur in addWnm */ params->next = 0; if (youpi == ECM_FACTOR_FOUND_STEP2) { outputf (OUTPUT_VERBOSE, "Found factor while computing G[]\n"); break; } } /* Is this a root we should skip? (Take only if gcd == 1) */ if (gcd ((unsigned long) params->rsieve, (unsigned long) params->dsieve) == 1UL) { mpres_get_z (G[i++], (fd + params->next * (params->S + 1))->x, modulus); outputf (OUTPUT_TRACE, "ecm_rootsG: storing d1*%u*X = %Zd in G[%lu]\n", params->rsieve, G[i - 1], i); } params->next ++; params->rsieve ++; } outputf (OUTPUT_VERBOSE, "Computing roots of G took %ldms", elltime (st, cputime ())); outputf (OUTPUT_DEVVERBOSE, ", %lu muls and %lu extgcds", muls, gcds); outputf (OUTPUT_VERBOSE, "\n"); return youpi; } /* Find smallest i >= 0 such that f(j * d2)*X = +-f((i0 + i) * d1)*X over GF(p). If "+" holds, return 1, if "-" holds, return -1. If the correct i could not be determined (because a non-invertible residue appeared during initialisation) return 0. */ int ecm_findmatch (unsigned long *I, const unsigned long j, root_params_t *root_params, const curve *X, mpmod_t n, const mpz_t p) { const int dickson_a = root_params->S < 0 ? 
-1 : 0; const unsigned int S = abs (root_params->S); const unsigned int sizeT = S + 3; unsigned int k; unsigned long i; int r, sgn = 0; point iX, jX; curve Xp; /* The point and curve over GF(p) */ mpmod_t modulus; mpz_t s, t; /* temp vars */ mpres_t u, v; /* temp vars */ listz_t coeffs; point *fd; mpres_t *T; outputf (OUTPUT_RESVERBOSE, "Looking for i such that " "f((i+%Zd)*%lu)*X = f(%lu*%lu)*X\n", root_params->i0, root_params->d1, j, root_params->d2); mpmod_init (modulus, p, ECM_MOD_DEFAULT); mpz_init (s); mpz_init (t); mpres_init (u, modulus); mpres_init (v, modulus); mpres_init (Xp.x, modulus); mpres_init (Xp.y, modulus); mpres_init (Xp.A, modulus); mpres_init (iX.x, modulus); mpres_init (iX.y, modulus); mpres_init (jX.x, modulus); mpres_init (jX.y, modulus); T = malloc (sizeT * sizeof (mpres_t)); if (T == NULL) goto clear_and_exit; for (k = 0; k < sizeT; k++) mpres_init (T[k], modulus); fd = malloc ((S + 1) * sizeof (point)); if (fd == NULL) goto clear_T_and_exit; for (k = 0; k < S + 1; k++) { mpres_init (fd[k].x, modulus); mpres_init (fd[k].y, modulus); } /* Copy the parameters of the curve over Z/ZN to the curve over GF(p) */ mpres_get_z (t, X->x, n); mpres_set_z (Xp.x, t, modulus); mpres_get_z (t, X->y, n); mpres_set_z (Xp.y, t, modulus); mpres_get_z (t, X->A, n); mpres_set_z (Xp.A, t, modulus); /* We use init_progression_coeffs() to compute f(j * d2) */ mpz_set_ui (t, j); coeffs = init_progression_coeffs (t, 1UL, root_params->d2, 1U, 1U, S, dickson_a); if (coeffs == NULL) goto clear_fd_and_exit; /* Now compute f(j * d2) X */ r = multiplyW2n (NULL, &jX, &Xp, coeffs, 1U, modulus, u, v, T, NULL, NULL); clear_list (coeffs, S + 1); if (r != ECM_NO_FACTOR_FOUND) goto clear_fd_and_exit; /* We'll keep {f(j * d2) X}_x in s */ mpres_get_z (s, jX.x, modulus); outputf (OUTPUT_DEVVERBOSE, "ecm_findmatch: (f(j * d2) X)_x = %Zd\n", s); /* Now compute {f((i0 + i) d1) X}_x one at a time and put them in t, until s == t */ /* Init the progression */ coeffs = 
init_progression_coeffs (root_params->i0, 1UL, root_params->d1, 1U, 1U, S, dickson_a); if (coeffs == NULL) goto clear_fd_and_exit; r = multiplyW2n (NULL, fd, &Xp, coeffs, S + 1, modulus, u, v, T, NULL, NULL); clear_list (coeffs, S + 1); if (r != ECM_NO_FACTOR_FOUND) goto clear_fd_and_exit; mpres_get_z (t, fd[0].x, modulus); for (i = 0; mpz_cmp (s, t) != 0; i++) { r = addWnm (NULL, fd, &Xp, modulus, 1, S, T, NULL, NULL); if (r != ECM_NO_FACTOR_FOUND) goto clear_fd_and_exit; mpres_get_z (t, fd[0].x, modulus); } outputf (OUTPUT_DEVVERBOSE, "ecm_findmatch: i - i0 = %lu, " "{f(i * d1) X}_x = %Zd\n", i, t); /* We'll compute f(i * d1)*X and compare it to f(j * d2)*X to verify correctness of the result, and to determine whether it was f(i * d1)-f(j * d2) or f(i * d1)+f(j * d2) that found the factor */ /* We use init_progression_coeffs() to compute f(i * d1) */ mpz_add_ui (t, root_params->i0, i); coeffs = init_progression_coeffs (t, 1UL, root_params->d1, 1U, 1U, S, dickson_a); if (coeffs == NULL) goto clear_fd_and_exit; /* Now compute iX = f(i * d1)*X */ r = multiplyW2n (NULL, &iX, &Xp, coeffs, 1U, modulus, u, v, T, NULL, NULL); clear_list (coeffs, S + 1); if (r != ECM_NO_FACTOR_FOUND) goto clear_fd_and_exit; mpres_get_z (t, iX.x, modulus); if (mpz_cmp (s, t) != 0) { outputf (OUTPUT_ERROR, "ecm_findmatch: ERROR, (f(i*d1) X)_x != " "(f(j*d2) X)_x\n(f(i*d1) X)_x = %Zd\n", t); goto clear_fd_and_exit; } mpres_get_z (s, jX.y, modulus); mpres_get_z (t, iX.y, modulus); if (mpz_cmp (s, t) == 0) { *I = i; sgn = 1; } else { mpz_sub (t, p, t); if (mpz_cmp (s, t) == 0) { *I = i; sgn = -1; } else { mpz_sub (t, p, t); outputf (OUTPUT_ERROR, "ecm_findmatch: ERROR, (f(i*d1) X)_y != " "+-(f(j*d2) X)_y\n"); outputf (OUTPUT_ERROR, "(f(i*d1) X)_y = %Zd\n", t); outputf (OUTPUT_ERROR, "(f(j*d2) X)_y = %Zd\n", s); } } clear_fd_and_exit: for (k = 0; k < S + 1; k++) { mpres_clear (fd[k].x, modulus); mpres_clear (fd[k].y, modulus); } free(fd); clear_T_and_exit: for (k = 0; k < sizeT; k++) 
mpres_clear (T[k], modulus); free (T); clear_and_exit: mpz_clear (s); mpz_clear (t); mpres_clear (u, modulus); mpres_clear (v, modulus); mpres_clear (Xp.x, modulus); mpres_clear (Xp.y, modulus); mpres_clear (Xp.A, modulus); mpres_clear (iX.x, modulus); mpres_clear (iX.y, modulus); mpres_clear (jX.x, modulus); mpres_clear (jX.y, modulus); mpmod_clear (modulus); return sgn; } ecm-6.4.4/schoen_strass.c0000644023561000001540000012007612106741273012246 00000000000000/* Arithmetic modulo Fermat numbers. Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Alexander Kruppa, Paul Zimmermann This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include /* for abs if assertions enabled */ #include "ecm-impl.h" #include "ecm-gmp.h" #ifdef HAVE_LIMITS_H # include #else # ifndef UINT_MAX # define UINT_MAX (~(unsigned int) 0) # endif #endif /* #define DEBUG 1 #define CHECKSUM 1 */ static mpz_t gt; static int gt_inited = 0; static int radix2 = 0; unsigned int Fermat; #define CACHESIZE 512U /* a' <- a+b, b' <- a-b. 
*/ #define ADDSUB_MOD(a, b) \ mpz_sub (gt, a, b); \ mpz_add (a, a, b); \ F_mod_gt (b, n); \ F_mod_1 (a, n); __GMP_DECLSPEC mp_limb_t __gmpn_mod_34lsub1 (mp_limb_t*, mp_size_t); /* compute remainder modulo 2^(GMP_LIMB_BITS*3/4)-1 */ #ifndef HAVE___GMPN_MOD_34LSUB1 mp_limb_t __gmpn_mod_34lsub1 (mp_limb_t *src, mp_size_t size) { mp_ptr tp; mp_limb_t r, d; ASSERT(GMP_LIMB_BITS % 4 == 0); tp = malloc (size * sizeof (mp_limb_t)); if (tp == NULL) { fprintf (stderr, "Cannot allocate memory in __gmpn_mod_34lsub1\n"); exit (1); } MPN_COPY (tp, src, size); d = ((mp_limb_t) 1 << (3 * (GMP_LIMB_BITS / 4))) - (mp_limb_t) 1; mpn_divmod_1 (&r, tp, size, d); free (tp); return r; } #endif /* RS -> RS (mod 2^n+1). If input |RS| < 2^(2*n), result |RS| < 2^(n+1) */ static inline void F_mod_1 (mpz_t RS, unsigned int n) { mp_size_t size; mp_limb_t v; size = mpz_size (RS); if ((unsigned int) size == n / GMP_NUMB_BITS + 1) { int sgn; sgn = mpz_sgn (RS); /* Remember original sign */ v = mpz_getlimbn (RS, n / GMP_NUMB_BITS); mpz_tdiv_r_2exp (RS, RS, n); /* Just a truncate. RS < 2^n. Can make RS zero and so change sgn(RS)! 
*/ if (sgn == -1) mpz_add_ui (RS, RS, v); else mpz_sub_ui (RS, RS, v); } else if ((unsigned int) size > n / GMP_NUMB_BITS + 1) { /* Assuming |RS| < 2^(2*n) */ mpz_tdiv_q_2exp (gt, RS, n); /* |gt| < 2^n */ mpz_tdiv_r_2exp (RS, RS, n); /* |RS| < 2^n */ mpz_sub (RS, RS, gt); /* |RS| < 2^(n+1) */ } } /* R = gt (mod 2^n+1) */ static inline void F_mod_gt (mpz_t R, unsigned int n) { mp_size_t size; mp_limb_t v; size = mpz_size (gt); ASSERT(R != gt); if ((unsigned int) size == n / GMP_NUMB_BITS + 1) { int sgn; sgn = mpz_sgn (gt); v = mpz_getlimbn (gt, n / GMP_NUMB_BITS); mpz_tdiv_r_2exp (gt, gt, n); /* Just a truncate */ if (sgn == -1) mpz_add_ui (R, gt, v); else mpz_sub_ui (R, gt, v); } else if ((unsigned int) size > n / GMP_NUMB_BITS + 1) { mpz_tdiv_q_2exp (R, gt, n); mpz_tdiv_r_2exp (gt, gt, n); /* Just a truncate */ mpz_sub (R, gt, R); } else mpz_set (R, gt); } /* R = S1 * S2 (mod 2^n+1) where n is a power of 2 */ /* S1 == S2, S1 == R, S2 == R ok, but none may == gt */ static void F_mulmod (mpz_t R, mpz_t S1, mpz_t S2, unsigned int n) { int n2 = (n - 1) / GMP_NUMB_BITS + 1; /* type of _mp_size is int */ F_mod_1 (S1, n); F_mod_1 (S2, n); if (mpz_size (S1) > (unsigned) n2) { outputf (OUTPUT_ERROR, "Warning: S1 >= 2^%d after reduction, has %lu bits. " "Trying again\n", n, (unsigned long) mpz_sizeinbase (S1, 2)); F_mod_1 (S1, n); } if (mpz_size (S2) > (unsigned) n2) { outputf (OUTPUT_ERROR, "Warning: S2 >= 2^%d after reduction, has %lu bits. " "Trying again\n", n, (unsigned long) mpz_sizeinbase (S2, 2)); F_mod_1 (S2, n); } if (n >= 32768) { unsigned long k; _mpz_realloc (gt, n2 + 1); /* in case the reallocation fails, _mpz_realloc sets the value to 0 */ ASSERT_ALWAYS (mpz_cmp_ui (gt, 0) != 0); k = mpn_fft_best_k (n2, S1 == S2); mpn_mul_fft (PTR(gt), n2, PTR(S1), ABSIZ(S1), PTR(S2), ABSIZ(S2), k); MPN_NORMALIZE(PTR(gt), n2); SIZ(gt) = ((SIZ(S1) ^ SIZ(S2)) >= 0) ? 
n2 : -n2; F_mod_gt (R, n); return; } mpz_mul (gt, S1, S2); F_mod_gt (R, n); return; } /* R = S + sgn(S)*(2^e) */ static void mpz_absadd_2exp (mpz_t RS, unsigned int e) { mp_size_t siz, limb_idx, bit_idx; mp_limb_t cy; int sgn; limb_idx = e / GMP_NUMB_BITS; bit_idx = e % GMP_NUMB_BITS; siz = mpz_size (RS); sgn = (mpz_sgn (RS) >= 0) ? 1 : -1; if (limb_idx >= RS->_mp_alloc) /* WARNING: mpz_realloc2 does not keep the value!!! */ mpz_realloc2 (RS, (limb_idx + 1) * GMP_NUMB_BITS); /* Now RS->_mp_alloc > limb_idx) */ while (siz <= limb_idx) { RS->_mp_d[siz++] = 0; RS->_mp_size += sgn; } /* Now RS->_mp_alloc >= siz > limb_idx */ cy = mpn_add_1 (RS->_mp_d + limb_idx, RS->_mp_d + limb_idx, siz - limb_idx, ((mp_limb_t)1) << bit_idx); if (cy) { if (RS->_mp_alloc <= siz) /* WARNING: mpz_realloc2 does not keep the value!!! */ mpz_realloc2 (RS, (siz + 1) * GMP_NUMB_BITS); RS->_mp_d[siz] = 1; RS->_mp_size += sgn; } } /* R = S / 2 (mod 2^n + 1). S == gt is ok */ static void F_divby2 (mpz_t R, mpz_t S, unsigned int n) { int odd, sgn; odd = mpz_odd_p (S); sgn = mpz_sgn (S); mpz_tdiv_q_2exp (R, S, 1); if (odd) { /* We shifted out a set bit at the bottom. With negative wrap-around, that becomes -2^(n-1), so we add -2^(n-1) + 2^n+1 = 2^(n-1)+1. If |S| < 2^(n+1), |R| < 2^n + 2^(n-1) + 1 < 2^(n+1) for n > 1. */ mpz_absadd_2exp (R, n - 1); if (sgn < 0) mpz_sub_ui (R, R, 1); else mpz_add_ui (R, R, 1); } } /* RS = RS / 3 (mod 2^n + 1). RS == gt is ok */ static void F_divby3_1 (mpz_t RS, unsigned int n) { /* 2^2^m == 1 (mod 3) for m>0, thus F_m == 2 (mod 3) */ int mod, sgn; sgn = mpz_sgn (RS); mod = __gmpn_mod_34lsub1 (RS->_mp_d, mpz_size (RS)) % 3; if (mod == 1) { /* Add F_m. If |RS| < 2^(n+1), |RS|+F_m < 3*2^n+1 */ mpz_absadd_2exp (RS, n); if (sgn >= 0) mpz_add_ui (RS, RS, 1); else mpz_sub_ui (RS, RS, 1); } else if (mod == 2) { /* Add 2 * F_m. 
If |RS| < 2^(n+1), |RS|+2*F_m < 4*2^n+2 */ mpz_absadd_2exp (RS, n + 1); if (sgn >= 0) mpz_add_ui (RS, RS, 2); else mpz_sub_ui (RS, RS, 2); } mpz_divby3_1op (RS); /* |RS| < (4*2^n+2)/3 < 2^(n+1) */ } static void F_divby5_1 (mpz_t RS, unsigned int n) { /* 2^2^m == 1 (mod 5) for m>1, thus F_m == 2 (mod 5) */ int mod, sgn; sgn = mpz_sgn (RS); mod = __gmpn_mod_34lsub1 (RS->_mp_d, mpz_size (RS)) % 5; if (mod == 1) { /* Add 2 * F_m == 4 (mod 5) */ mpz_absadd_2exp (RS, n + 1); if (sgn == 1) mpz_add_ui (RS, RS, 2); else mpz_sub_ui (RS, RS, 2); } else if (mod == 2) { /* Add 4 * F_m == 3 (mod 5) */ mpz_absadd_2exp (RS, n + 2); if (sgn == 1) mpz_add_ui (RS, RS, 4); else mpz_sub_ui (RS, RS, 4); } else if (mod == 3) { /* Add F_m == 3 (mod 5) */ mpz_absadd_2exp (RS, n); if (sgn == 1) mpz_add_ui (RS, RS, 1); else mpz_sub_ui (RS, RS, 1); } else if (mod == 4) { /* Add 3 * F_m == 1 (mod 5) */ mpz_absadd_2exp (RS, n); mpz_absadd_2exp (RS, n + 1); if (sgn == 1) mpz_add_ui (RS, RS, 3); else mpz_sub_ui (RS, RS, 3); } ASSERT(mpz_divisible_ui_p (RS, 5)); mpz_divexact_ui (RS, RS, 5); } /* A 2^(m+2) length convolution is possible: (2^(3n/4) - 2^(n/4))^2 == 2 (mod 2^n+1) so we have an element of order 2^(m+2) of simple enough form to use it as a root of unity the transform */ /* Multiply by sqrt(2)^e (mod F_m). n = 2^m */ /* R = (S * sqrt(2)^e) % (2^n+1) */ /* R == S is ok, but neither must be == gt */ /* Assumes abs(e) < 4*n */ static void F_mul_sqrt2exp (mpz_t R, mpz_t S, int e, unsigned int n) { int chgsgn = 0, odd; ASSERT(S != gt); ASSERT(R != gt); ASSERT((unsigned) abs (e) < 4 * n); if (e < 0) e += 4 * n; /* 0 <= e < 4*n */ if ((unsigned) e >= 2 * n) /* sqrt(2)^(2*n) == -1 (mod F_m), so */ { e -= 2 * n; /* sqrt(2)^e == -sqrt(2)^(e-2*n) (mod F_m) */ chgsgn = 1; } /* Now e < 2*n */ #ifdef DEBUG_PERF if (e == 0) outputf (OUTPUT_ALWAYS, "F_mul_sqrt2exp: called for trivial case %s1\n", chgsgn ? 
"-" : ""); #endif odd = e & 1; e >>= 1; if (odd) { /* Multiply by sqrt(2) == 2^(3n/4) - 2^(n/4) */ /* S * (2^(3n/4) - 2^(n/4)) == 2^(n/4) * (S*2^(n/2) - S) */ mpz_mul_2exp (gt, S, n / 2); mpz_sub (gt, gt, S); mpz_tdiv_q_2exp (R, gt, n / 4 * 3); mpz_tdiv_r_2exp (gt, gt, n / 4 * 3); mpz_mul_2exp (gt, gt, n / 4); mpz_sub (R, gt, R); if (e != 0) { mpz_tdiv_q_2exp (gt, R, n-e); mpz_tdiv_r_2exp (R, R, n-e); mpz_mul_2exp (R, R, e); mpz_sub (R, R, gt); } } else if (e != 0) { /* S = a*2^(n-e) + b, b < 2^(n-e) */ /* S*2^e = a*2^n + b*2^e = b*2^e - a */ /* b*2^e < 2^(n-e)*2^e = 2^n */ mpz_tdiv_q_2exp (gt, S, n - e); /* upper e bits (=a) into gt */ mpz_tdiv_r_2exp (R, S, n - e); /* lower n-e bits (=b) into R */ /* This is simply a truncate if S == R */ mpz_mul_2exp (R, R, e); /* R < 2^n */ mpz_sub (R, R, gt); } else mpz_set (R, S); if (chgsgn) mpz_neg (R, R); } /* Same, but input may be gt. Input and output must not be identical */ static void F_mul_sqrt2exp_2 (mpz_t R, mpz_t S, int e, unsigned int n) { int chgsgn = 0, odd; ASSERT (S != R); ASSERT (R != gt); ASSERT ((unsigned) abs (e) < 4 * n); if (e < 0) e += 4 * n; if ((unsigned) e >= 2 * n) /* sqrt(2)^(2*n) == -1 (mod F_m), so */ { e -= 2 * n; /* sqrt(2)^e == -sqrt(2)^(e-2*n) (mod F_m) */ chgsgn = 1; } /* Now e < 2*n */ #ifdef DEBUG_PERF if (e == 0) outputf (OUTPUT_ALWAYS, "F_mul_sqrt2exp_2: called for trivial case %s1\n", chgsgn ? "-" : ""); #endif odd = e & 1; e >>= 1; if (odd != 0) { mpz_set (R, S); /* Neccessary? 
n/32 mov*/ mpz_mul_2exp (gt, S, n / 2); /* May overwrite S n/32 mov */ mpz_sub (gt, gt, R); /* n/32 sub*/ mpz_tdiv_q_2exp (R, gt, n / 4 * 3); /* 3*(n/32)/4 mov */ mpz_tdiv_r_2exp (gt, gt, n / 4 * 3); /* Just a truncate */ mpz_mul_2exp (gt, gt, n / 4); /* 3*(n/32)/4 mov */ mpz_sub (R, gt, R); /* (n/32)/4 sub, 3*(n/32)/4 mov */ if (e != 0) { mpz_tdiv_q_2exp (gt, R, n - e); mpz_tdiv_r_2exp (R, R, n - e); mpz_mul_2exp (R, R, e); mpz_sub (R, R, gt); } } else if (e != 0) { mpz_tdiv_q_2exp (R, S, n - e); /* upper e bits into R */ mpz_tdiv_r_2exp (gt, S, n - e); /* lower n-e bits into gt */ mpz_mul_2exp (gt, gt, e); mpz_sub (R, gt, R); } else mpz_set (R, S); if (chgsgn == -1) mpz_neg (R, R); } #define A0s A[0] #define A1s A[l << stride2] #define A2s A[2 * l << stride2] #define A3s A[3 * l << stride2] #define A0is A[i << stride2] #define A1is A[(i + l) << stride2] #define A2is A[(i + 2 * l) << stride2] #define A3is A[(i + 3 * l) << stride2] /* Decimation-in-frequency FFT. Unscrambled input, scrambled output. */ /* Elements are (mod 2^n+1), l and n must be powers of 2, l must be <= 4*n. */ /* Performs forward transform */ static void F_fft_dif (mpz_t *A, int l, int stride2, int n) { int i, omega = (4 * n) / l, iomega; if (l <= 1) return; ASSERT((4 * n) % l == 0); if (l == 2) { ADDSUB_MOD(A[0], A[1< 1) { F_fft_dif (A, l, stride2, n); F_fft_dif (A + (l << stride2), l, stride2, n); F_fft_dif (A + (2 * l << stride2), l, stride2, n); F_fft_dif (A + (3 * l << stride2), l, stride2, n); } return; } l /= 2; ADDSUB_MOD(A[0], A1s); for (i = 1, iomega = omega; i < l; i++, iomega += omega) { mpz_sub (gt, A0is, A1is); mpz_add (A0is, A0is, A1is); F_mul_sqrt2exp_2 (A1is, gt, iomega, n); F_mod_1 (A0is, n); } F_fft_dif (A, l, stride2, n); F_fft_dif (A + (l << stride2), l, stride2, n); } /* Decimation-in-time inverse FFT. Scrambled input, unscrambled output */ /* Does not perform divide-by-length. 
l, and n as in F_fft_dif() */ static void F_fft_dit (mpz_t *A, int l, int stride2, int n) { int i, omega = (4 * n) / l, iomega; if (l <= 1) return; ASSERT((4 * n) % l == 0); if (l == 2) { ADDSUB_MOD(A[0], A[1< 1) { F_fft_dit (A, l, stride2, n); F_fft_dit (A + (l << stride2), l, stride2, n); F_fft_dit (A + (2 * l << stride2), l, stride2, n); F_fft_dit (A + (3 * l << stride2), l, stride2, n); } mpz_sub (gt, A3s, A1s); /* gt = -(a1 - a3) */ mpz_add (A1s, A1s, A3s); /* A1 = a1 + a3 */ F_mul_sqrt2exp_2 (A3s, gt, n, n); /* A3 = i * -(a1 - a3) */ mpz_sub (gt, A[0], A2s); /* gt = a0 - a2 */ mpz_add (A[0], A[0], A2s); /* A0 = a0 + a2 */ mpz_sub (A2s, A[0], A1s); /* A2 = a0 - a1 + a2 - a3 */ mpz_add (A[0], A[0], A1s); /* A0 = a0 + a1 + a2 + a3 */ mpz_add (A1s, gt, A3s); /* A1 = a0 - a2 + i * -(a1 - a3) */ mpz_sub (A3s, gt, A3s); /* A3 = a0 - a2 - i * -(a1 - a3) */ for (i = 1, iomega = omega; i < l; i++, iomega += omega) { /* Divide by omega^i. Since sqrt(2)^(4*n) == 1 (mod 2^n+1), this is like multiplying by omega^(4*n-i) */ F_mul_sqrt2exp (A1is, A1is, 4 * n - iomega, n); F_mul_sqrt2exp (A2is, A2is, 4 * n - 2 * iomega, n); F_mul_sqrt2exp (A3is, A3is, 4 * n - 3 * iomega, n); mpz_sub (gt, A3is, A1is); mpz_add (A1is, A1is, A3is); F_mul_sqrt2exp_2 (A3is, gt, n, n); mpz_sub (gt, A0is, A2is); mpz_add (A0is, A0is, A2is); mpz_sub (A2is, A0is, A1is); mpz_add (A0is, A0is, A1is); mpz_add (A1is, gt, A3is); mpz_sub (A3is, gt, A3is); if (1) { F_mod_1 (A0is, n); F_mod_1 (A1is, n); F_mod_1 (A2is, n); F_mod_1 (A3is, n); } } return; } l /= 2; F_fft_dit (A, l, stride2, n); F_fft_dit (A + (l << stride2), l, stride2, n); ADDSUB_MOD(A[0], A1s); for (i = 1, iomega = 4*n - omega; i < l; i++, iomega -= omega) { F_mul_sqrt2exp (A1is, A1is, iomega, n); mpz_sub (gt, A0is, A1is); mpz_add (A0is, A0is, A1is); F_mod_gt (A1is, n); F_mod_1 (A0is, n); } } #define A0 A[i] #define A1 A[l+i] #define A2 A[2*l+i] #define A3 A[3*l+i] #define B0 B[i] #define B1 B[l+i] #define B2 B[2*l+i] #define B3 B[3*l+i] #define 
C0 C[i] #define C1 C[l+i] #define C2 C[2*l+i] #define C3 C[3*l+i] #define C4 C[4*l+i] #define C5 C[5*l+i] #define C6 C[6*l+i] #define C7 C[7*l+i] #define t0 t[i] #define t1 t[l+i] #define t2 t[2*l+i] #define t3 t[3*l+i] #define t4 t[4*l+i] #define t5 t[5*l+i] static unsigned int F_toomcook4 (mpz_t *C, mpz_t *A, mpz_t *B, unsigned int len, unsigned int n, mpz_t *t) { unsigned int l, i, r; ASSERT(len % 4 == 0); l = len / 4; if (A == B) /* Squaring. The interpolation could probably be optimized, too */ { for (i = 0; i < l; i++) { /*** Evaluate A(2), A(-2), 8*A(1/2) ***/ mpz_mul_2exp (t0, A0, 1); mpz_add (t0, t0, A1); mpz_mul_2exp (t0, t0, 1); mpz_add (t0, t0, A2); mpz_mul_2exp (t0, t0, 1); mpz_add (t0, t0, A3); /* t[0 .. l-1] = 8*A(1/2) < 15*N */ F_mod_1 (t0, n); mpz_mul_2exp (t2, A3, 2); mpz_add (t2, t2, A1); mpz_mul_2exp (t2, t2, 1); /* t[2l .. 3l-1] = 8*A_3 + 2*A_1 */ mpz_mul_2exp (gt, A2, 2); mpz_add (gt, gt, A0); /* gt = 4*A_2 + A0 */ mpz_sub (t4, gt, t2); /* t[4l .. 5l-1] = A(-2) */ mpz_add (t2, t2, gt); /* t[2l .. 3l-1] = A(2) */ F_mod_1 (t4, n); F_mod_1 (t2, n); /* Evaluate A(1), A(-1) */ mpz_add (C2, A0, A2); /* May overwrite A2 */ mpz_add (gt, A1, A3); mpz_sub (C4, C2, gt); /* C4 = A(-1) */ mpz_add (C2, C2, gt); /* C2 = A(1) < 4*N */ F_mod_1 (C2, n); F_mod_1 (C4, n); } /* A0 A1 A2 A3 */ /* A0 A(1) A3 A(-1) */ /* C0 C1 C2 C3 C4 C5 C6 C7 */ r = F_mul (t, t, t, l, DEFAULT, n, t + 6 * l); /* t0 = (8*A(1/2)) ^ 2 = 64*C(1/2) */ r += F_mul (t + 2 * l, t + 2 * l, t + 2 * l, l, DEFAULT, n, t + 6 * l); /* t2 = A(2) ^ 2 = C(2) */ r += F_mul (t + 4 * l, t + 4 * l, t + 4 * l, l, DEFAULT, n, t + 6 * l); /* t4 = A(-2) ^ 2 = C(-2) */ r += F_mul (C, A, A, l, DEFAULT, n, t + 6 * l); /* C0 = A(0) ^ 2 = C(0) */ r += F_mul (C + 6 * l, A + 3 * l, A + 3 * l, l, DEFAULT, n, t + 6 * l); /* C6 = A(inf) ^ 2 = C(inf) */ r += F_mul (C + 2 * l, C + 2 * l, C + 2 * l, l, DEFAULT, n, t + 6 * l); /* C2 = A(1) ^ 2 = C(1). 
May overwrite A3 */ r += F_mul (C + 4 * l, C + 4 * l, C + 4 * l, l, DEFAULT, n, t + 6 * l); /* C4 = A(-1) ^ 2 = C(-1) */ } else /* Multiply */ { for (i = 0; i < l; i++) { /*** Evaluate A(2), A(-2), 8*A(1/2) ***/ mpz_mul_2exp (t0, A0, 1); mpz_add (t0, t0, A1); mpz_mul_2exp (t0, t0, 1); mpz_add (t0, t0, A2); mpz_mul_2exp (t0, t0, 1); mpz_add (t0, t0, A3); /* t[0 .. l-1] = 8*A(1/2) < 15*N */ F_mod_1 (t0, n); mpz_mul_2exp (t2, A3, 2); mpz_add (t2, t2, A1); mpz_mul_2exp (t2, t2, 1); /* t[2l .. 3l-1] = 8*A_3 + 2*A_1 */ mpz_mul_2exp (gt, A2, 2); mpz_add (gt, gt, A0); /* gt = 4*A_2 + A0 */ mpz_sub (t4, gt, t2); /* t[4l .. 5l-1] = A(-2) */ mpz_add (t2, t2, gt); /* t[2l .. 3l-1] = A(2) */ F_mod_1 (t4, n); F_mod_1 (t2, n); /*** Evaluate B(2), B(-2), 8*B(1/2) ***/ mpz_mul_2exp (t1, B0, 1); mpz_add (t1, t1, B1); mpz_mul_2exp (t1, t1, 1); mpz_add (t1, t1, B2); mpz_mul_2exp (t1, t1, 1); mpz_add (t1, t1, B3); /* t[l .. 2l-1] = 8*B(1/2) */ F_mod_1 (t1, n); mpz_mul_2exp (t3, B3, 2); mpz_add (t3, t3, B1); mpz_mul_2exp (t3, t3, 1); /* t[3l .. 4l-1] = 8*B_3 + 2*B_1 */ mpz_mul_2exp (gt, B2, 2); mpz_add (gt, gt, B0); /* gt = 4*B_2 + B0 */ mpz_sub (t5, gt, t3); /* t[5l .. 6l-1] = B(-2) */ mpz_add (t3, t3, gt); /* t[3l .. 4l-1] = B(2) */ F_mod_1 (t5, n); F_mod_1 (t3, n); /* Evaluate A(1), A(-1) */ mpz_add (C2, A0, A2); /* May overwrite A2 */ #undef A2 mpz_add (gt, A1, A3); mpz_set (C1, B0); /* C1 = B(0) May overwrite A1 */ #undef A1 mpz_sub (C4, C2, gt); /* C4 = A(-1). May overwrite B0 */ #undef B0 mpz_add (C2, C2, gt); /* C2 = A(1) < 4*N */ F_mod_1 (C2, n); F_mod_1 (C4, n); /* Evaluate B(1), B(-1) */ mpz_add (gt, C1, B2); /* B0 is in C1 */ mpz_set (C6, A3); /* C6 = A(inf) May overwrite B2 */ #undef B2 mpz_add (C3, B1, B3); /* May overwrite A3 */ #undef A3 mpz_sub (C5, gt, C3); /* C5 = B(-1). 
May overwrite B1 */ #undef B1 mpz_add (C3, gt, C3); /* C3 = B(1) */ F_mod_1 (C3, n); F_mod_1 (C5, n); } /* A0 A1 A2 A3 B0 B1 B2 B3 */ /* A0 B0 A(1) B(1) A(-1) B(-1) A3 B3 */ /* C0 C1 C2 C3 C4 C5 C6 C7 */ r = F_mul (t, t, t + l, l, DEFAULT, n, t + 6 * l); /* t0 = 8*A(1/2) * 8*B(1/2) = 64*C(1/2) */ r += F_mul (t + 2 * l, t + 2 * l, t + 3 * l, l, DEFAULT, n, t + 6 * l); /* t2 = A(2) * B(2) = C(2) */ r += F_mul (t + 4 * l, t + 4 * l, t + 5 * l, l, DEFAULT, n, t + 6 * l); /* t4 = A(-2) * B(-2) = C(-2) */ r += F_mul (C, A, C + l, l, DEFAULT, n, t + 6 * l); /* C0 = A(0)*B(0) = C(0) */ r += F_mul (C + 2 * l, C + 2 * l, C + 3 * l, l, DEFAULT, n, t + 6 * l); /* C2 = A(1)*B(1) = C(1) */ r += F_mul (C + 4 * l, C + 4 * l, C + 5 * l, l, DEFAULT, n, t + 6 * l); /* C4 = A(-1)*B(-1) = C(-1) */ r += F_mul (C + 6 * l, C + 6 * l, B + 3 * l, l, DEFAULT, n, t + 6 * l); /* C6 = A(inf)*B(inf) = C(inf) */ } /* C(0) C(1) C(-1) C(inf) 64*C(1/2) C(2) C(-2) */ /* C0,C1 C2,C3 C4,C5 C6,C7 t0,t1 t2,t3 t4,t5 */ for (i = 0; i < 2 * l - 1; i++) { mpz_add (t0, t0, t2); /* t0 = 65 34 20 16 20 34 65 */ mpz_sub (gt, C2, C4); /* gt = 2*C_odd(1) = 0 2 0 2 0 2 0 */ mpz_add (C2, C2, C4); /* C2 = 2*C_even(1) = 2 0 2 0 2 0 2 */ F_divby2 (C2, C2, n); /* C2 = C_even(1) */ mpz_add (C4, t2, t4); /* C4 = 2*C_even(2) */ F_divby2 (C4, C4, n); /* C4 = C_even(2) */ mpz_sub (t4, t2, t4); /* t4 = 2*C_odd(2) */ F_divby2 (t4, t4, n); F_divby2 (t4, t4, n); /* t4 = C_odd(2)/2 = C_1 + 4*C_3 + 16*C_5 */ F_divby2 (t2, gt, n); /* t2 = C_odd(1) */ mpz_sub (t0, t0, gt); /* t0 = 65 32 20 14 20 32 65 */ mpz_mul_2exp (gt, gt, 4); mpz_sub (t0, t0, gt); /* t0 = 65 0 20 -18 20 0 65 */ mpz_add (gt, C0, C6); /* gt = C_0 + C_6 */ mpz_sub (C2, C2, gt); /* C2 = C_2 + C_4 */ mpz_sub (t0, t0, gt); /* t0 = 64 0 20 -18 20 0 64 */ mpz_mul_2exp (gt, gt, 5); /* gt = 32*C_0 + 32*C_6 */ F_divby2 (t0, t0, n); /* t0 = 32 0 10 -9 10 0 32 */ mpz_sub (t0, t0, gt); /* t0 = 0 0 10 -9 10 0 0 */ mpz_sub (t0, t0, C2); /* t0 = 0 0 9 -9 9 0 0 */ F_divby3_1 (t0, 
n); F_divby3_1 (t0, n); /* t0 = 0 0 1 -1 1 0 0 */ mpz_sub (t0, C2, t0); /* t0 = C_3 */ mpz_sub (t2, t2, t0); /* t2 = C_1 + C_5 */ mpz_mul_2exp (gt, t0, 2); /* gt = 4*C_3 */ mpz_sub (t4, t4, gt); /* t4 = C_1 + 16*C_5 */ mpz_sub (t4, t4, t2); /* t4 = 15*C_5 */ F_divby3_1 (t4, n); F_divby5_1 (t4, n); /* t4 = C_5 */ mpz_sub (t2, t2, t4); /* t2 = C_1 */ mpz_sub (C4, C4, C0); /* C4 = 4*C_2 + 16*C_4 + 64*C_6 */ F_divby2 (C4, C4, n); F_divby2 (C4, C4, n); /* C4 = C_2 + 4*C_4 + 16*C_6 */ mpz_mul_2exp (gt, C6, 4); mpz_sub (C4, C4, gt); /* C4 = C_2 + 4*C_4 */ mpz_sub (C4, C4, C2); /* C4 = 3*C_4 */ F_divby3_1 (C4, n); /* C4 = C_4 */ mpz_sub (C2, C2, C4); /* C2 = C_2 */ } for (i = 0; i < l - 1; i++) { mpz_add (C1, C1, t2); F_mod_1 (C1, n); } mpz_set (C1, t2); F_mod_1 (C1, n); for (i = l; i < 2 * l - 1; i++) { mpz_add (C1, C1, t2); F_mod_1 (C1, n); } for (i = 0; i < l - 1; i++) { mpz_add (C3, C3, t0); F_mod_1 (C3, n); } mpz_set (C3, t0); F_mod_1 (C3, n); for (i = l; i < 2 * l - 1; i++) { mpz_add (C3, C3, t0); F_mod_1 (C3, n); } for (i = 0; i < l - 1; i++) { mpz_add (C5, C5, t4); F_mod_1 (C5, n); } mpz_set (C5, t4); F_mod_1 (C5, n); for (i = l; i < 2 * l - 1; i++) { mpz_add (C5, C5, t4); F_mod_1 (C5, n); } return r; } /* Karatsuba split. Calls F_mul() to multiply the three pieces. 
*/
/* R    result array, 4 * (len / 2) entries are written
   A, B operands of length len (len must be even); A == B selects the
        squaring path
   n    coefficients are modulo 2^n+1
   t    scratch: t[0 .. len-1] is used here (len being the original
        length) and t + len is handed on to F_mul()
   Returns the sum of the three F_mul() return values. */
static unsigned int
F_karatsuba (mpz_t *R, mpz_t *A, mpz_t *B, unsigned int len, unsigned int n,
             mpz_t *t)
{
  unsigned int i, r;

  ASSERT(len % 2 == 0);

  /* From here on len is the length of one half */
  len /= 2;

  if (A == B) /* Squaring */
    {
      r = F_mul (t, A, A + len, len, DEFAULT, n, t + 2 * len); /* A0 * A1 */
      r += F_mul (R + 2 * len, A + len, A + len, len, DEFAULT, n, t + 2 * len); /* A1^2 */
      r += F_mul (R, A, A, len, DEFAULT, n, t + 2 * len); /* A0^2 */
      /* Add 2 * A0*A1 in the middle */
      for (i = 0; i < 2 * len - 1; i++)
        {
          mpz_mul_2exp (t[i], t[i], 1);
          mpz_add (R[i + len], R[i + len], t[i]); /* i==len could be a mpz_set */
        }
      return r;
    }

  for (i = 0; i < len; i++)
    {
      mpz_add (t[i],       A[i], A[i + len]); /* t0 = A0 + A1 */
      mpz_add (t[i + len], B[i], B[i + len]); /* t1 = B0 + B1 */
    }

  r = F_mul (t, t, t + len, len, DEFAULT, n, t + 2 * len);
  /* t[0...2*len-1] = (A0+A1) * (B0+B1) = A0*B0 + A0*B1 + A1*B0 + A1*B1 */

  /* The order of the two remaining products depends on how R overlaps the
     operands, so that no input is overwritten before it has been used */
  if (R != A)
    {
      r += F_mul (R, A, B, len, DEFAULT, n, t + 2 * len);
      /* R[0...2*len-1] = A0 * B0 */
      r += F_mul (R + 2 * len, A + len, B + len, len, DEFAULT, n, t + 2 * len);
      /* R[2*len...4*len-1] = A1 * B1, may overwrite B */
    }
  else if (R + 2 * len != B)
    {
      r += F_mul (R + 2 * len, A + len, B + len, len, DEFAULT, n, t + 2 * len);
      /* R[2*len...4*len-1] = A1 * B1 */
      r += F_mul (R, A, B, len, DEFAULT, n, t + 2 * len);
      /* R[0...2*len-1] = A0 * B0, overwrites A */
    }
  else /* R == A && R + 2*len == B */
    {
      for (i = 0; i < len; i++)
        { /* mpz_swap instead? Perhaps undo later? Or interface for F_mul
             to specify separate result arrays for high/low half? */
          mpz_set (gt, A[len + i]); /* Swap A1 and B0 */
          mpz_set (A[len + i], B[i]);
          mpz_set (B[i], gt);
        }
      r += F_mul (R, R, R + len, len, DEFAULT, n, t + 2 * len);
      /* R[0...2*len-1] = A0 * B0, overwrites A */
      r += F_mul (R + 2 * len, R + 2 * len, R + 3 * len, len, DEFAULT, n,
                  t + 2 * len);
      /* R[2*len...4*len-1] = A1 * B1, overwrites B */
    }

  /* R[0...2*len-2] == A0*B0, R[2*len-1] == 0 */
  /* R[2*len...4*len-2] == A1*B1, R[4*len-1] == 0 */
  /* t[0...2*len-2] == (A0+A1)*(B0+B1), t[2*len-1] == 0 */

  /* We're doing indices i and i+len in one loop on the assumption
     that 6 residues will probably fit into cache. After all,
     Karatsuba is only called for smallish F_m. This way, the final
     add R[i+len] += t[i] can be done inside the same loop and we need
     only one pass over main memory. */

  for (i = 0; i < len - 1; i++)
    {
      mpz_sub (t[i], t[i], R[i]); /* t = A0*B1 + A1*B0 + A1*B1 */
      mpz_sub (t[i], t[i], R[i + 2 * len]); /* t = A0*B1 + A1*B0 */
      mpz_sub (t[i + len], t[i + len], R[i + len]);
      mpz_sub (t[i + len], t[i + len ], R[i + 3 * len]);

      mpz_add (R[i + len], R[i + len], t[i]);
      mpz_add (R[i + 2 * len], R[i + 2 * len], t[i + len]);
    }
  /* Index len - 1 is handled separately, outside the loop */
  mpz_sub (t[len - 1], t[len - 1], R[len - 1]);
  mpz_sub (R[2 * len - 1], t[len - 1], R[3 * len - 1]);

  return r;
}

/* Multiply two polynomials with coefficients modulo 2^(2^m)+1. */
/* len is length (=degree+1) of polynomials and must be a power of 2. */
/* n=2^m */
/* Return value: number of multiplies performed, or UINT_MAX in case of error */
/* parameter is compared against MONIC and NOPAD below; callers in this file
   pass DEFAULT.  With NOPAD the transform length is len instead of 2*len.
   t is scratch space.  The first call initialises the file-scope scratch
   variable gt with room for 2*n bits. */

unsigned int
F_mul (mpz_t *R, mpz_t *A, mpz_t *B, unsigned int len, int parameter,
       unsigned int n, mpz_t *t)
{
  unsigned int i, r=0;
  unsigned int transformlen = (parameter == NOPAD) ? len : 2 * len;
#ifdef CHECKSUM
  mpz_t chksum1, chksum_1, chksum0, chksuminf;
#endif

  /* Handle trivial cases */
  if (len == 0)
    return 0;

  if (!gt_inited)
    {
      mpz_init2 (gt, 2 * n);
      gt_inited = 1;
    }

  if (len == 1)
    {
      if (parameter == MONIC)
        {
          /* (x + a0)(x + b0) = x^2 + (a0 + b0)x + a0*b0 */
          mpz_add (gt, A[0], B[0]);
          F_mod_gt (t[0], n);
          F_mulmod (R[0], A[0], B[0], n); /* May overwrite A[0] */
          mpz_set (R[1], t[0]); /* May overwrite B[0] */
          /* We don't store the leading 1 monomial in the result poly */
        }
      else
        {
          F_mulmod (R[0], A[0], B[0], n); /* May overwrite A[0] */
          mpz_set_ui (R[1], 0); /* May overwrite B[0] */
        }

      return 1;
    }

#ifdef CHECKSUM
  /* Self-check: record A*B at 1, -1, 0 and infinity before R is written,
     and compare with R after the multiplication */
  mpz_init2 (chksum1, n+64);
  mpz_init2 (chksum_1, n+64);
  mpz_init2 (chksum0, n+64);
  mpz_init2 (chksuminf, n+64);

  mpz_set_ui (gt, 0);
  for (i = 0; i < len; i++)
    {
      /* Compute A(1) and B(1) */
      mpz_add (chksum1, chksum1, A[i]);
      mpz_add (gt, gt, B[i]);

      /* Compute A(-1) and B(-1) */
      if (i % 2 == 0)
        {
          mpz_add (chksum_1, chksum_1, A[i]);
          mpz_add (chksum0, chksum0, B[i]); /* chksum0 used temporarily here */
        }
      else
        {
          mpz_sub (chksum_1, chksum_1, A[i]);
          mpz_sub (chksum0, chksum0, B[i]);
        }
    }

  if (parameter == MONIC)
    {
      mpz_add_ui (chksum1, chksum1, 1);
      mpz_add_ui (gt, gt, 1);
      mpz_add_ui (chksum_1, chksum_1, 1);
      mpz_add_ui (chksum0, chksum0, 1);
    }

  mpz_mul (gt, gt, chksum1);
  F_mod_gt (chksum1, n);

  mpz_mul (gt, chksum0, chksum_1);
  F_mod_gt (chksum_1, n);

  /* Compute A(0) * B(0) */
  mpz_mul (gt, A[0], B[0]);
  F_mod_gt (chksum0, n);

  /* Compute A(inf) * B(inf) */
  mpz_mul (gt, A[len - 1], B[len - 1]);
  F_mod_gt (chksuminf, n);
  if (parameter == MONIC)
    {
      mpz_add (chksuminf, chksuminf, A[len - 2]);
      mpz_add (chksuminf, chksuminf, B[len - 2]);
    }

  r += 4;
#endif /* CHECKSUM */

  /* Don't do FFT if len =< 4 (Karatsuba or Toom-Cook are faster) unless we
     do a transform without zero padding, or if transformlen > 4*n
     (no suitable primitive roots of 1) */
  if ((len > 4 || parameter == NOPAD) && transformlen <= 4 * n)
    {
      unsigned int len2;

      /* len2 = log_2(transformlen). Assumes transformlen > 0 */
      for (i = transformlen, len2 = 0; (i&1) == 0; i >>= 1, len2++);

      if (i != 1)
        {
          outputf (OUTPUT_ERROR, "F_mul: polynomial length must be power of 2, "
                           "but is %d\n", len);
          return UINT_MAX;
        }

      /* Are we performing a squaring or multiplication? */
      if (A != B)
        {
          /* So it's a multiplication */

          /* Put transform of B into t */
          for (i = 0; i < len; i++)
            mpz_set (t[i], B[i]);
          if (parameter == MONIC)
            mpz_set_ui (t[i++], 1);
          for (; i < transformlen; i++)
            mpz_set_ui (t[i], 0);

          F_fft_dif (t, transformlen, 0, n);
        } else
          t = R; /* Do squaring */

      /* Put A into R */
      for (i = 0; i < len; i++)
        mpz_set (R[i], A[i]);
      if (parameter == MONIC)
        mpz_set_ui (R[i++], 1); /* May overwrite B[0] */
      for (; i < transformlen; i++)
        mpz_set_ui (R[i], 0); /* May overwrite B[i - len] */

      F_fft_dif (R, transformlen, 0, n);

      /* Pointwise product of the two transforms */
      for (i = 0; i < transformlen; i++)
        {
          F_mulmod (R[i], R[i], t[i], n);
          /* Do the div-by-length. Transform length was transformlen,
             len2 = log_2 (transformlen), so divide by
             2^(len2) = sqrt(2)^(2*len2) */

          F_mul_sqrt2exp (R[i], R[i], - 2 * len2, n);
        }

      r += transformlen;

      F_fft_dit (R, transformlen, 0, n);

      if (parameter == MONIC)
        mpz_sub_ui (R[0], R[0], 1);

    } else { /* Karatsuba or Toom-Cook split */

      if (parameter == NOPAD)
        {
          outputf (OUTPUT_ERROR, "F_mul: cyclic/short products not supported "
                   "by Karatsuba/Toom-Cook\n");
          return UINT_MAX;
        }

      if (len / n == 4 || len == 2)
        r += F_karatsuba (R, A, B, len, n, t);
      else
        r += F_toomcook4 (R, A, B, len, n, t);

      if (parameter == MONIC) /* Handle the leading monomial the hard way */
        {
          /* This only works if A, B and R do not overlap */
          if (A == R || B == R + len)
            {
              outputf (OUTPUT_ERROR, "F_mul: monic polynomials with Karatsuba/"
                       "Toom-Cook and overlapping input/output not supported\n");
              return UINT_MAX;
            }
          for (i = 0; i < len; i++)
            {
              mpz_add (R[i + len], R[i + len], A[i]);
              mpz_add (R[i + len], R[i + len], B[i]);
              F_mod_1 (R[i + len], n);
            }
        }
    }

#ifdef DEBUG
  if (parameter != MONIC && parameter != NOPAD)
    {
      F_mod_1 (R[transformlen - 1], n);
      if (mpz_sgn (R[transformlen - 1]) != 0)
        outputf (OUTPUT_ALWAYS, "F_mul, len %d: R[%d] == %Zd != 0\n",
                 len, transformlen - 1, R[transformlen - 1]);
    }
#endif

#ifdef CHECKSUM
  /* Compute R(1) = (A*B)(1) and subtract from chksum1 */

  for (i = 0; i < transformlen; i++)
    mpz_sub (chksum1, chksum1, R[i]);

  if (parameter == MONIC)
    mpz_sub_ui (chksum1, chksum1, 1);

  while (mpz_sizeinbase (chksum1, 2) > n)
    F_mod_1 (chksum1, n);

  if (mpz_sgn (chksum1) != 0)
    outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(1)*B(1) != R(1), difference %Zd\n",
             len, chksum1);

  /* Compute R(-1) = (A*B)(-1) and subtract from chksum_1 */

  for (i = 0; i < transformlen; i++)
    if (i % 2 == 0)
      mpz_sub (chksum_1, chksum_1, R[i]);
    else
      mpz_add (chksum_1, chksum_1, R[i]);

  if (parameter == MONIC)
    mpz_sub_ui (chksum_1, chksum_1, 1);

  while (mpz_sizeinbase (chksum_1, 2) > n)
    F_mod_1 (chksum_1, n);

  if (mpz_sgn (chksum_1) != 0)
    outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(-1)*B(-1) != R(-1), difference %Zd\n",
             len, chksum_1);

  if (parameter != NOPAD)
    {
      mpz_sub (chksum0, chksum0, R[0]);
      while (mpz_sizeinbase (chksum0, 2) > n)
        F_mod_1 (chksum0, n);

      if (mpz_sgn (chksum0) != 0)
        outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(0)*B(0) != R(0), difference %Zd\n",
                 len, chksum0);

      mpz_sub (chksuminf, chksuminf, R[transformlen - 2]);
      while (mpz_sizeinbase (chksuminf, 2) > n)
        F_mod_1 (chksuminf, n);

      if (mpz_sgn (chksuminf) != 0)
        outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(inf)*B(inf) != R(inf), difference %Zd\n",
                 len, chksuminf);
    }

  mpz_clear (chksum1);
  mpz_clear (chksum_1);
  mpz_clear (chksum0);
  mpz_clear (chksuminf);
#endif /* CHECKSUM */

  return r;
}

/* Transposed multiply of two polynomials with coefficients
   modulo 2^(2^m)+1.
   lenB is the length of polynomial B and must be a power of 2,
   lenA is the length of polynomial A and must be lenB / 2 or lenB / 2 + 1.
   n=2^m
   t must have space for 2*lenB coefficients
   Only the product coefficients [lenA - 1 ... lenA + lenB/2 - 2] will go into
   R[0 ...
lenB / 2 - 1] Return value: number of multiplies performed, UINT_MAX in error case. */ unsigned int F_mul_trans (mpz_t *R, mpz_t *A, mpz_t *B, unsigned int lenA, unsigned int lenB, unsigned int n, mpz_t *t) { unsigned int i, r = 0, len2; /* Handle trivial cases */ if (lenB < 2) return 0; ASSERT(lenA == lenB / 2 || lenA == lenB / 2 + 1); if (!gt_inited) { mpz_init2 (gt, 2 * n); gt_inited = 1; } if (lenB == 2) { F_mulmod (R[0], A[0], B[0], n); return 1; } if (lenB <= 4 * n) { /* len2 = log_2(lenB) */ for (i = lenB, len2 = 0; i > 1 && (i&1) == 0; i >>= 1, len2++); if (i != 1) { outputf (OUTPUT_ERROR, "F_mul_trans: polynomial length must be power of 2, " "but is %d\n", lenB); return UINT_MAX; } /* Put transform of B into t */ for (i = 0; i < lenB; i++) mpz_set (t[i], B[i]); F_fft_dif (t, lenB, 0, n); /* Put transform of reversed A into t + lenB */ for (i = 0; i < lenA; i++) mpz_set (t[i + lenB], A[lenA - 1 - i]); for (i = lenA; i < lenB; i++) mpz_set_ui (t[i + lenB], 0); F_fft_dif (t + lenB, lenB, 0, n); for (i = 0; i < lenB; i++) { F_mulmod (t[i], t[i], t[i + lenB], n); /* Do the div-by-length. 
Transform length was len, so divide by 2^len2 = sqrt(2)^(2*len2) */ F_mul_sqrt2exp (t[i], t[i], - 2 * len2, n); } r += lenB; F_fft_dit (t, lenB, 0, n); for (i = 0; i < lenB / 2; i++) mpz_set (R[i], t[i + lenA - 1]); } else { /* Only Karatsuba, no Toom-Cook here */ unsigned int h = lenB / 4; const unsigned int lenA0 = h, lenA1 = lenA - h; outputf (OUTPUT_DEVVERBOSE, "schoen_strass.c: Transposed Karatsuba, " "lenA = %lu, lenB = %lu\n", lenA, lenB); /* A = a1 * x^h + a0 B = b3 * x^3h + b2 * x^2h + b1 * x^h + b0 mul^T(A, B) = mul^T(a0,b3) * x^4h + (mul^T(a1,b3) + mul^T(a0,b2)) * x^3h + (mul^T(a1,b2) + mul^T(a0,b1)) * x^2h + (mul^T(a1,b1) + mul^T(a0,b0)) * x + mul^T(a1,b0) We only want the x^h, x^2h and x^3h coefficients, mul^T(a1,b1) + mul^T(a0,b0) mul^T(a1,b2) + mul^T(a0,b1) mul^T(a1,b3) + mul^T(a0,b2) Specifically, we want R[i] = \sum_{j=0}^{lenA} A[j] * B[j+i], 0 <= i < 2h */ /* T */ for (i = 0; i < h; i++) mpz_add (t[i], A[i], A[i + h]); if (lenA1 == h + 1) mpz_set (t[h], A[2*h]); r = F_mul_trans (t, t, B + h, lenA1, 2 * h, n, t + lenA1); /* Uses t[h ... 5h-1] as temp */ /* U */ for (i = 0; i < 2 * h; i++) mpz_sub (t[i + h], B[i], B[h + i]); r += F_mul_trans (t + h, A, t + h, lenA0, 2 * h, n, t + 3 * h); /* Uses t[3h ... 7h-1] as temp */ for (i = 0; i < h; i++) mpz_add (R[i], t[i], t[i + h]); /* R[0 ... h-1] = t + r */ /* V */ for (i = 0; i < 2 * h; i++) mpz_sub (t[i + h], B[i + 2 * h], B[i + h]); r += F_mul_trans (t + h, A + h, t + h, lenA1, 2 * h, n, t + 3 * h); /* Uses t[3h ... 
7h - 1] as temp */ for (i = 0; i < h; i++) mpz_add (R[i + h], t[i], t[i + h]); } return r; } void F_clear () { if (gt_inited) mpz_clear (gt); gt_inited = 0; } ecm-6.4.4/ecm-params.h.pentium-m0000644023561000001540000000117512106741273013332 00000000000000/* those parameters were obtained on toto.loria.fr with ecm-6.3-rc3 gmp-5.0.1, and gcc 4.0.2 -m32 -O2 -pedantic -fomit-frame-pointer -mtune=pentium3 -march=pentium3 */ #define MPZMOD_THRESHOLD 98 #define REDC_THRESHOLD 398 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 12, 12, 1, 14, 12, 13, 1, 15, 16, 15, 16, 19, 20, 22} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 1024 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 256 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 1024 ecm-6.4.4/candi.c0000644023561000001540000002030112106741273010434 00000000000000/* Encapsulated candidate. This candidate should have been a C++ class, but since we are using straight C for this project, I guess I can deal with it. Copyright 2003, 2004, 2005, 2006 Jim Fougeron, Paul Zimmermann. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include #include #include "ecm-ecm.h" #define VALID_MAGIC 0x24837BF5 #define DEAD_MAGIC 0xDEADBEEF #if defined (CANDI_DEBUG) static void Candi_Validate (const char *FunctionStr, const mpcandi_t *n) { int abrt = 0; if (!FunctionStr) { fprintf (stderr, "ERROR, UNKNOWN FUNCTION, can NOT continue checks!\n"); exit(-1); } if (!n) { abrt = fprintf (stderr, "ERROR, %s() *n was NULL, can NOT continue checks!\n", FunctionStr); exit(-1); } if (n->magic != VALID_MAGIC) abrt = fprintf (stderr, "ERROR, %s() VALID_MAGIC not valid\n", FunctionStr); if (n->cpExpr && n->nexprlen != strlen(n->cpExpr)) abrt = fprintf (stderr, "ERROR, %s() Invalid cpExpr length\n", FunctionStr); if (n->ndigits != nb_digits(n->n)) abrt = fprintf (stderr, "ERROR, %s() Invalid n->ndigits length\n", FunctionStr); if (abrt) exit(-1); } #endif void mpcandi_t_init (mpcandi_t *n) { n->cpExpr = NULL; n->nexprlen = 0; n->ndigits = 1; mpz_init_set_ui (n->n, 1); n->isPrp = 0; #if defined (CANDI_DEBUG) n->magic = VALID_MAGIC; Candi_Validate ("mpcandi_t_init", n); #endif } void mpcandi_t_free (mpcandi_t *n) { #if defined (CANDI_DEBUG) Candi_Validate("mpcandi_t_free", n); #endif if (n->cpExpr) free (n->cpExpr); n->cpExpr = NULL; n->nexprlen = 0; n->ndigits = 0; mpz_clear (n->n); n->isPrp = 1; /* "default" to prp, so that if the candidate does not get filled in, it will not be tested */ #if defined (CANDI_DEBUG) n->magic = DEAD_MAGIC; #endif } /* performs a safe "deep" copy */ int mpcandi_t_copy (mpcandi_t *to, mpcandi_t *from) { #if defined (CANDI_DEBUG) Candi_Validate("Pre mpcandi_t_copy", to); Candi_Validate("Pre mpcandi_t_copy", from); #endif if (to == from) return 1; if (to->cpExpr) free(to->cpExpr); to->cpExpr = NULL; if (from->cpExpr) { to->cpExpr = (char *) malloc(from->nexprlen+1); if (to->cpExpr == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } strcpy(to->cpExpr, from->cpExpr); } to->nexprlen = from->nexprlen; mpz_set(to->n, from->n); to->isPrp = from->isPrp; 
to->ndigits = from->ndigits; #if defined (CANDI_DEBUG) Candi_Validate("Post mpcandi_t_copy", to); Candi_Validate("Post mpcandi_t_copy", from); #endif return 1; } int mpcandi_t_add_candidate (mpcandi_t *n, mpz_t c, const char *cpExpr, int primetest) { #if defined (CANDI_DEBUG) Candi_Validate("Pre mpcandi_t_add_candidate", n); #endif if (n->cpExpr) free (n->cpExpr); n->cpExpr = NULL; if (cpExpr) { n->nexprlen = strlen (cpExpr); n->cpExpr = (char *) malloc (n->nexprlen + 1); if (n->cpExpr == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } strcpy (n->cpExpr, cpExpr); } mpz_set (n->n, c); if (primetest) n->isPrp = probab_prime_p (c, PROBAB_PRIME_TESTS); else n->isPrp = 0; /* there is a candidate there now, and the user did not tell us to prp it, so assume it is composite */ n->ndigits = nb_digits (c); #if defined (CANDI_DEBUG) Candi_Validate("Post mpcandi_t_add_candidate", n); #endif return 1; } int mpcandi_t_addfoundfactor_d (mpcandi_t *n, double f) { #if defined (CANDI_DEBUG) Candi_Validate("Pre mpcandi_t_addfoundfactor_d", n); #endif int ret; mpz_t t; mpz_init_set_d(t,f); /* do not display a warning if this factor does not divide the remaining cofactor. This function is called repeatedly (until it fails) to remove all traces of the prime factor. It is highly likely that these smaller factors will be non square-free within the candidate when starting. A return of zero is exprected by the calling trial divider, as that tells it that all residue of the factor has been eliminated */ ret = mpcandi_t_addfoundfactor (n, t, 0); mpz_clear (t); #if defined (CANDI_DEBUG) Candi_Validate("Post mpcandi_t_addfoundfactor_d", n); #endif return ret; } int mpcandi_t_addfoundfactor (mpcandi_t *n, mpz_t f, int displaywarning) { #if defined (CANDI_DEBUG) Candi_Validate("Pre mpcandi_t_addfoundfactor_d", n); #endif char *cp, *cp1; if (!mpz_divisible_p (n->n, f)) { /* ERROR was not a factor NOTE however, that this is "valid" for the ui() function to call. 
When trial dividing, it is VERY frequent to be divisible by 2^3, and we try to remove factors UNTIL */ if (displaywarning) gmp_fprintf (stderr, "ECM logic ERROR. Trying to remove a " "non-factor %Zd\n", f); #if defined (CANDI_DEBUG) Candi_Validate("Post (no factor removed) mpcandi_t_addfoundfactor_d", n); #endif return 0; } /* remove f from n->n */ mpz_divexact (n->n, n->n, f); n->ndigits = nb_digits (n->n); n->isPrp = probab_prime_p (n->n, PROBAB_PRIME_TESTS); if (n->cpExpr != NULL) { /* If there is an expression, then lets preserve it */ cp1 = mpz_get_str (NULL, 10, f); cp = (char *) malloc(n->nexprlen+1 + 3 + strlen(cp1)); /* +1 for null, +3 for ()/ */ if (cp == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } sprintf (cp, "(%s)/%s", n->cpExpr, cp1); free(n->cpExpr); n->cpExpr = cp; n->nexprlen += (3+strlen(cp1)); FREE (cp1, strlen (cp1) + 1); } #if defined (CANDI_DEBUG) Candi_Validate("Post (removed factor) mpcandi_t_addfoundfactor_d", n); #endif return 1; } /********************************************************************** Group order candidate functions. These wrap the logic for the -go command line switch which allows the user to "insert" the proper group order. 
**********************************************************************/ void mpgocandi_t_init (mpgocandi_t *go) { go->cpOrigExpr = NULL; mpcandi_t_init (&(go->Candi)); go->containsN = 0; go->Valid = 0; } void mpgocandi_t_free (mpgocandi_t *go) { if (go->cpOrigExpr) free (go->cpOrigExpr); mpcandi_t_free (&(go->Candi)); go->Valid = 0; } int mpgocandi_fixup_with_N (mpgocandi_t *go, mpcandi_t *n) { int NumNs, len; char *cp, *cpo, *numbuf; if (go->Valid == 0) return 0; if (go->containsN == 0) return 1; /* a valid "normal" expression does not need updating */ cp = strchr (go->cpOrigExpr, 'N'); NumNs = 0; while (cp) { ++NumNs; cp = strchr (&cp[1], 'N'); } /* compute size of string needed, and add some safety buffer to it */ cp = go->cpOrigExpr; len = NumNs * mpz_sizeinbase (n->n, 10) + strlen (cp) + 100; numbuf = (char *) malloc(len); if (numbuf == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } cpo = numbuf; while (*cp) { if (*cp == 'N') cpo += gmp_sprintf (cpo, "%Zi", n->n); else *cpo++ = *cp; ++cp; } *cpo = 0; /* Null terminate the string correctly. */ if (eval_str (&(go->Candi), numbuf, 0, NULL)) go->Valid = 1; else { static int warned = 0; if (!warned) { warned = 1; fprintf(stderr, "Warning, invalid expression %s for the -go option\n", go->cpOrigExpr); } go->Valid = 0; /* it is not valid, so do not use it */ } free (numbuf); return go->Valid; } ecm-6.4.4/eval.c0000644023561000001540000004401112106741274010312 00000000000000/* Simple expression parser for GMP-ECM. Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2012 Jim Fougeron, Paul Zimmermann and Alexander Kruppa. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include "ecm-ecm.h" #ifdef HAVE_STRINGS_H # include /* for strncasecmp */ #endif #ifdef HAVE_CTYPE_H # include #endif /***************************************************************** * Syntax for this VERY simple recursive expression parser: * * * * ( or [ or { along with ) or ] or } are valid for grouping * * Normal "simple" operators: + - * / (. can be used for *) * * Modulo: n%m 345%11 * * Unary minus is supported: -n -500 * * Exponentiation: n^m 2^500 * * Simple factorial: n! 53! 
== 1*2*3*4...*52*53 * * Multi-factorial: n!m 15!3 == 15.12.9.6.3 * * Simple Primorial: n# 11# == 2*3*5*7*11 * * Reduced Primorial: n#m 17#5 == 5.7.11.13.17 * * * * Adding (working on these at least: * * Phi(x,n) * * * * NOTE Lines ending in a \ character are "joined" * * NOTE Lines starting with #are comments * * NOTE C++ // single line comments (rest of line is a comment) * * * ****************************************************************/ /* value only used by the expression parser */ static mpz_t t, mpOne; static char *expr_str; static void eval_power (mpz_t prior_n, mpz_t n,char op); static void eval_product (mpz_t prior_n, mpz_t n,char op); static void eval_sum (mpz_t prior_n, mpz_t n,char op); static int eval_Phi (mpz_t prior_n, mpz_t n, int ParamCnt); static int eval_2 (int bInFuncParams); #if 0 /* strncasecmp is a required function in configure.in */ #if defined (_MSC_VER) || defined (__MINGW32__) #define strncasecmp strnicmp #endif #endif /**************************************/ /* Main expression evalation function */ /* This is the function that the app */ /* calls to read the expression line */ /**************************************/ int eval (mpcandi_t *n, FILE *fd, int primetest) { int ret; int nMaxSize = 2000, nCurSize = 0; int c; char *expr = (char *) malloc (nMaxSize + 1); if (expr == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } /* Lines ending in '\\' are "joined" as a single longer line */ JoinLinesLoop:; c = fgetc (fd); if (c == '#') { ChompLine:; do c = fgetc (fd); while (c != EOF && !IS_NEWLINE(c)); if (IS_NEWLINE(c)) goto JoinLinesLoop; } while (c != EOF && !IS_NEWLINE(c) && c != ';') { if (c == '/') { /* This might be a C++ // comment or it might be a / division operator. 
Check it out, and if it is a comment, then "eat it" */ int peek_c = fgetc (fd); if (peek_c == '/') /* Got a C++ single line comment, so Chomp the line */ goto ChompLine; /* Put the char back on the file, then allow the code to add the '/' char to the buffer */ ungetc (peek_c, fd); } /* strip space and tabs out here, and then we DON'T have to mess with them in the rest of the parser */ if (!isspace (c)) expr[nCurSize++] = (char) c; if (nCurSize == nMaxSize) { char *cp; nMaxSize += nMaxSize / 2; cp = (char *) realloc (expr, nMaxSize + 1); if (!cp) { free (expr); fprintf (stderr, "Severe warning!, out of core memory reading number!\n"); exit (EXIT_FAILURE); } expr = cp; } c = fgetc (fd); } expr[nCurSize] = 0; if (!nCurSize) ret = 0; else { if (expr[nCurSize-1] == '\\') { /* remove the '\\' char, and then process the next line */ expr[--nCurSize] = 0; goto JoinLinesLoop; } if (c == ';') ungetc (c, fd); mpz_init (t); expr_str = expr; ret = eval_2 (0); if (ret) { char *s; char *cpTmpExpr = expr; s = mpz_get_str (NULL, 10, t); if (!strcmp(s, cpTmpExpr)) cpTmpExpr = NULL; ret = mpcandi_t_add_candidate (n, t, cpTmpExpr, primetest); FREE (s, strlen (s) + 1); } mpz_clear(t); } free(expr); return ret; } int eval_str (mpcandi_t *n, char *cp, int primetest, char **EndChar) { int ret; int nMaxSize=2000, nCurSize=0; char *c; char *expr = (char *) malloc(nMaxSize+1); if (expr == NULL) { fprintf (stderr, "Error: not enough memory\n"); exit (EXIT_FAILURE); } /* Lines ending in '\\' are "joined" as a single longer line */ c = cp; JoinLinesLoop:; if (*c == '#') { do ++c; while (*c && !IS_NEWLINE(*c)); if (IS_NEWLINE(*c)) goto JoinLinesLoop; } while (*c && !IS_NEWLINE(*c) && *c != ';') { /* strip space and tabs out here, and then we DON'T have to mess with them in the rest of the parser */ if (!isspace((int) *c)) expr[nCurSize++] = *c; if (nCurSize == nMaxSize) { char *cp; nMaxSize += 5000; cp = (char *) realloc (expr, nMaxSize + 1); if (!cp) { free(expr); fprintf(stderr, "Severe 
warning!, out of core memory reading number!\n"); exit (EXIT_FAILURE); } expr = cp; } ++c; } expr[nCurSize] = 0; if (!nCurSize) ret = 0; else { if (expr[nCurSize-1] == '\\') { /* remove the '\\' char, and then process the next line */ expr[--nCurSize] = 0; goto JoinLinesLoop; } if (*c != ';') ++c; mpz_init(t); expr_str = expr; ret = eval_2(0); if (ret) { char *s; char *cpTmpExpr = expr; s = mpz_get_str (NULL, 10, t); if (!strcmp(s, cpTmpExpr)) cpTmpExpr = NULL; ret = mpcandi_t_add_candidate(n, t, cpTmpExpr, primetest); FREE (s, strlen (s) + 1); } mpz_clear(t); } free(expr); if (EndChar && *EndChar) *EndChar = c; return ret; } void eval_power (mpz_t prior_n, mpz_t n,char op) { #if defined (DEBUG_EVALUATOR) if ('#'==op || '^'==op || '!'==op || '@'==op || '$'==op) { fprintf (stderr, "eval_power "); mpz_out_str(stderr, 10, prior_n); fprintf (stderr, "%c", op); mpz_out_str(stderr, 10, n); fprintf (stderr, "\n"); } #endif if ('^'==op) mpz_pow_ui(n,prior_n,mpz_get_ui(n)); else if ('!'==op) /* simple factorial (syntax n! example: 7! == 1*2*3*4*5*6*7) */ mpz_fac_ui(n,mpz_get_ui(n)); else if ('@'==op) /* Multi factorial (syntax n!prior_n. 
example: 15!3 == 15*12*9*6*3) */ { long nCur; unsigned long nDecr; nCur = mpz_get_si(prior_n); nDecr = mpz_get_ui(n); mpz_set_ui(n,1); /*printf ("Multi-factorial %ld!%ld\n", nCur, nDecr);*/ while (nCur > 1) { /* This could be done much more efficiently (bunching mults using smaller "built-ins"), but I am not going to bother for now */ mpz_mul_ui(n,n,nCur); nCur -= nDecr; } } else if ('#'==op) /* simple primorial (syntax n# example: 11# == 2*3*5*7*11 */ { long nMax; double p; nMax = mpz_get_si(n); mpz_set_ui(n,1); getprime_clear (); /* free the prime tables, and reinitialize */ for (p = 2.0; p <= nMax; p = getprime ()) /* This could be done much more efficiently (bunching mults using smaller "built-ins"), but I am not going to bother for now */ mpz_mul_ui(n,n,(unsigned)p); } else if ('$'==op) /* reduced primorial (syntax n#prior_n example: 13#5 == (5*7*11*13) */ { double p; long nMax; unsigned long nStart; nMax = mpz_get_si(prior_n); nStart = mpz_get_ui(n); mpz_set_ui(n,1); getprime_clear (); /* free the prime tables, and reinitialize */ p = getprime (nStart); /*printf ("Reduced-primorial %ld#%ld\n", nMax, nStart);*/ for (; p <= nMax; p = getprime (p)) { /* Unfortunately, the SoE within GMP-ECM does not always start correctly, so we have to skip the low end stuff by hand */ if (p >= nStart) /* This could be done much more efficiently (bunching mults using smaller "built-ins"), but I am not going to bother for now */ mpz_mul_ui(n,n,(unsigned)p); } } } void eval_product (mpz_t prior_n, mpz_t n, char op) { #if defined (DEBUG_EVALUATOR) if ('*'==op || '.'==op || '/'==op || '%'==op) { fprintf (stderr, "eval_product "); mpz_out_str(stderr, 10, prior_n); fprintf (stderr, "%c", op); mpz_out_str(stderr, 10, n); fprintf (stderr, "\n"); } #endif if ('*' == op || '.' 
== op) mpz_mul (n, prior_n, n); else if ('/' == op) { mpz_t r; mpz_init (r); mpz_tdiv_qr (n, r, prior_n, n); if (mpz_cmp_ui (r, 0) != 0) { fprintf (stderr, "Parsing Error: inexact division\n"); exit (EXIT_FAILURE); } mpz_clear (r); } else if ('%' == op) mpz_tdiv_r (n, prior_n, n); } void eval_sum (mpz_t prior_n, mpz_t n,char op) { #if defined (DEBUG_EVALUATOR) if ('+'==op || '-'==op) { fprintf (stderr, "eval_sum "); mpz_out_str(stderr, 10, prior_n); fprintf (stderr, "%c", op); mpz_out_str(stderr, 10, n); fprintf (stderr, "\n"); } #endif if ('+' == op) mpz_add(n,prior_n,n); else if ('-' == op) mpz_sub(n,prior_n,n); } int eval_Phi (mpz_t b, mpz_t n, int ParamCnt) { int factors[200]; unsigned dwFactors=0, dw; int B; double p; mpz_t D, T, org_n; if (ParamCnt == 0) { fprintf (stderr, "\nParsing Error - the Phi function (in ECM) requires 2 parameters\n"); return 0; } if (mpz_cmp_ui(n, 1) == 0) { /* return value is 1 if b is composite, or b if b is prime */ int isPrime = mpz_probab_prime_p (b, PROBAB_PRIME_TESTS); if (isPrime) mpz_set(n, b); else mpz_set(n, mpOne); return 1; } if (mpz_cmp_si(n, -1) == 0) { /* this is actually INVALID, but it is easier to simply */ fprintf (stderr, "\nParsing Error - Invalid parameter passed to the Phi function\n"); return 0; } /* OK parse the Phi out now */ if (mpz_cmp_ui(b, 0) == 0) { /* this is valid, but return that it is NOT */ mpz_set(n, mpOne); return 0; } if (mpz_cmp_ui(b, 1) == 0) { if (mpz_cmp_ui(n, 1) != 0) mpz_sub_ui(n, n, 1); return 1; } /* Ok, do the real h_primative work, since we are not one of the trivial case */ B = mpz_get_si(b); if (mpz_cmp_ui(b, B)) { fprintf (stderr, "\nParsing Error - Invalid parameter passed to the Phi function (first param B too high)\n"); return 0; } /* Obtain the factors of B */ getprime_clear (); /* free the prime tables, and reinitialize */ for (p = 2.0; p <= B; p = getprime ()) { if (B % (int) p == 0) { /* Add the factor one time */ factors[dwFactors++] = (int) p; /* but be sure to totally 
remove it */ do { B /= (int) p; } while (B % (int) p == 0); } } B = mpz_get_si(b); mpz_init_set(org_n, n); mpz_set_ui(n, 1); mpz_init_set_ui(D, 1); mpz_init(T); for(dw=0;(dw<(1U<= n) return (ADD * n); d = n - r; e = 2 * r - n; c = DUP + ADD; /* initial duplicate and final addition */ while (d != e) { if (d < e) { r = d; d = e; e = r; } if (d - e <= e / 4 && ((d + e) % 3) == 0) { /* condition 1 */ d = (2 * d - e) / 3; e = (e - d) / 2; c += 3 * ADD; /* 3 additions */ } else if (d - e <= e / 4 && (d - e) % 6 == 0) { /* condition 2 */ d = (d - e) / 2; c += ADD + DUP; /* one addition, one duplicate */ } else if ((d + 3) / 4 <= e) { /* condition 3 */ d -= e; c += ADD; /* one addition */ } else if ((d + e) % 2 == 0) { /* condition 4 */ d = (d - e) / 2; c += ADD + DUP; /* one addition, one duplicate */ } /* now d+e is odd */ else if (d % 2 == 0) { /* condition 5 */ d /= 2; c += ADD + DUP; /* one addition, one duplicate */ } /* now d is odd and e even */ else if (d % 3 == 0) { /* condition 6 */ d = d / 3 - e; c += 3 * ADD + DUP; /* three additions, one duplicate */ } else if ((d + e) % 3 == 0) { /* condition 7 */ d = (d - 2 * e) / 3; c += 3 * ADD + DUP; /* three additions, one duplicate */ } else if ((d - e) % 3 == 0) { /* condition 8 */ d = (d - e) / 3; c += 3 * ADD + DUP; /* three additions, one duplicate */ } else /* necessarily e is even */ { /* condition 9 */ e /= 2; c += ADD + DUP; /* one addition, one duplicate */ } } return c; } #define NV 4 /* #define SWAP(x,y) { __mpz_struct *tmp = x; x = y; y = tmp; } */ #define SWAP mpres_swap /* computes V_k(P) from P=A and puts the result in P=A. Assumes k>2. Uses auxiliary variables t, B, C, T, T2. 
*/
/* Outline of the code below:
   1. pick, among the NV candidate ratios in val[], the one for which
      lucas_cost_pp1 (k, val[.]) is smallest (keeping index 0 if none beats
      the initial bound ADD * k);
   2. set r = round (k * val[i]), d = k - r, e = 2 * r - k, and start from
      B = A, C = A, A = 2*A;
   3. while d != e, make sure d >= e (swapping A and B along with d and e)
      and apply the first of conditions 1..9 that holds, updating d, e and
      the residues A, B, C with pp1_add3 / pp1_duplicate;
   4. finish with one more pp1_add3 into A.
   The statements inside each condition depend on their exact order, since
   A, B, C, T and T2 are overwritten in place. */
void
pp1_mul_prac (mpres_t A, ecm_uint k, mpmod_t n, mpres_t t, mpres_t B,
              mpres_t C, mpres_t T, mpres_t T2)
{
  ecm_uint d, e, r, i = 0;
  static double val[NV] =
    {0.61803398874989485, 0.5801787282954641, 0.6179144065288179 ,
     0.6180796684698958};
  /* 1/GR, 5/(GR+7) (2), 1429/(GR+2311) (8), 3739/(6051-GR) (9) */

  /* chooses the best value of v; here d is the index into val[], e the
     cost of that candidate and r the smallest cost seen so far */
  for (d = 0, r = ADD * k; d < NV; d++)
    {
      e = lucas_cost_pp1 (k, val[d]);
      if (e < r)
        {
          r = e;
          i = d;
        }
    }
  d = k;
  r = (ecm_uint) ((double) d * val[i] + 0.5); /* r = k * val[i], rounded */

  /* first iteration always begins by Condition 3, then a swap */
  d = k - r;
  e = 2 * r - k;
  mpres_set (B, A, n); /* B=A */
  mpres_set (C, A, n); /* C=A */
  pp1_duplicate (A, A, n); /* A = 2*A */
  while (d != e)
    {
      /* keep d >= e, exchanging A and B together with d and e */
      if (d < e)
        {
          r = d;
          d = e;
          e = r;
          mpres_swap (A, B, n);
        }
      /* do the first line of Table 4 whose condition qualifies */
      if (d - e <= e / 4 && ((d + e) % 3) == 0)
        { /* condition 1 */
          d = (2 * d - e) / 3;
          e = (e - d) / 2; /* uses the d just updated above */
          pp1_add3 (T, A, B, C, n, t); /* T = f(A,B,C) */
          pp1_add3 (T2, T, A, B, n, t); /* T2 = f(T,A,B) */
          pp1_add3 (B, B, T, A, n, t); /* B = f(B,T,A) */
          mpres_swap (A, T2, n); /* swap A and T2 */
        }
      else if (d - e <= e / 4 && (d - e) % 6 == 0)
        { /* condition 2 */
          d = (d - e) / 2;
          pp1_add3 (B, A, B, C, n, t); /* B = f(A,B,C) */
          pp1_duplicate (A, A, n); /* A = 2*A */
        }
      else if ((d + 3) / 4 <= e) /* <==> (d <= 4 * e) */
        { /* condition 3 */
          d -= e;
          pp1_add3 (C, B, A, C, n, t); /* C = f(B,A,C) */
          SWAP (B, C, n);
        }
      else if ((d + e) % 2 == 0)
        { /* condition 4 */
          d = (d - e) / 2;
          pp1_add3 (B, B, A, C, n, t); /* B = f(B,A,C) */
          pp1_duplicate (A, A, n); /* A = 2*A */
        }
      /* d+e is now odd */
      else if (d % 2 == 0)
        { /* condition 5 */
          d /= 2;
          pp1_add3 (C, C, A, B, n, t); /* C = f(C,A,B) */
          pp1_duplicate (A, A, n); /* A = 2*A */
        }
      /* d is odd, e even */
      else if (d % 3 == 0)
        { /* condition 6 */
          d = d / 3 - e;
          pp1_duplicate (T, A, n); /* T = 2*A */
          pp1_add3 (T2, A, B, C, n, t); /* T2 = f(A,B,C) */
          pp1_add3 (A, T, A, A, n, t); /* A = f(T,A,A) */
          pp1_add3 (C, T, T2, C, n, t); /* C = f(T,T2,C) */
          SWAP (B, C, n);
        }
      else if ((d + e) % 3 == 0) /* d+e <= val[i]*k < k < 2^32 */
        { /* condition 7 */
          d = (d - 2 * e) / 3;
          pp1_add3 (T, A, B, C, n, t); /* T = f(A,B,C) */
          pp1_add3 (B, T, A, B, n, t); /* B = f(T,A,B) */
          pp1_duplicate (T, A, n); /* T = 2*A */
          pp1_add3 (A, A, T, A, n, t); /* A = 3*A */
        }
      else if ((d - e) % 3 == 0)
        { /* condition 8: never happens? */
          d = (d - e) / 3;
          pp1_add3 (T, A, B, C, n, t); /* T = f(A,B,C) */
          pp1_add3 (C, C, A, B, n, t); /* C = f(C,A,B) */
          SWAP (B, T, n); /* swap B and T */
          pp1_duplicate (T, A, n); /* T = 2*A */
          pp1_add3 (A, A, T, A, n, t); /* A = 3*A */
        }
      else /* necessarily e is even */
        { /* condition 9: never happens? */
          e /= 2;
          pp1_add3 (C, C, B, A, n, t); /* C = f(C,B,A) */
          pp1_duplicate (B, B, n); /* B = 2*B */
        }
    }

  /* final addition, A = f(A,B,C) */
  pp1_add3 (A, A, B, C, n, t);

  /* the loop is expected to end with d == e == 1 */
  ASSERT(d == 1);
}
ecm-6.4.4/ecm-params.h.alpha-ev60000644023561000001540000000071112106741273013175 00000000000000#define MPZMOD_THRESHOLD 235 #define REDC_THRESHOLD 424 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 #define MUL_NTT_THRESHOLD 2048 #define PREREVERTDIVISION_NTT_THRESHOLD 1024 #define POLYINVERT_NTT_THRESHOLD 2048 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 256 ecm-6.4.4/athlon/0000755023561000001540000000000012113421640010552 500000000000000ecm-6.4.4/athlon/autogen.py0000755023561000001540000001675712106741265012543 00000000000000#!/usr/bin/python import re import sys def offaddr(addr, offset): if offset == 0: return "("+addr+")" else: return str(offset)+"("+addr+")" # Generate asm for addmul1_k # src and dst are pointers (stored in regs) + offsets # multiplier is in a register # rax, rbx, rcx, rdx are free for use. 
def addmul1_k(src, off_src, dst, off_dst, mult, k): init = "### addmul1: src[0] is " + offaddr(src, off_src) + "\n" init = init + "### dst[0] is " + offaddr(dst, off_dst) + "\n" init = init + "### mult is " + mult + "\n" init = init + "### k is " + str(k) + "\n" init = init + "### kills %eax, %ebx, %ecx, %edx\n" init = init + "### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx\n" init = init + " movl " + offaddr(src, off_src) + ", %eax\n" init = init + " mull " + mult + "\n" init = init + " movl %eax, %ebx\n" init = init + " movl %edx, %ecx\n" init = init + " movl " + offaddr(src, off_src+4) + ", %eax\n" block = """ mull __mult__ addl __cylo__, __zi__ movl $0, __cylo__ adcl %eax, __cyhi__ movl __xi2__, %eax adcl %edx, __cylo__ """ code = init cylo = "%ebx" cyhi = "%ecx" for i in range(0,k-2): blocki = re.sub('__cylo__', cylo, block) blocki = re.sub('__cyhi__', cyhi, blocki) blocki = re.sub('__xi2__', offaddr(src, off_src+(i+2)*4), blocki) blocki = re.sub('__zi__', offaddr(dst, off_dst+i*4), blocki) blocki = re.sub('__mult__', mult, blocki) code = code + blocki tmp = cylo cylo = cyhi cyhi = tmp final = " mull " + mult + "\n" final = final + " addl " + cylo + ", " + offaddr(dst, off_dst+(k-2)*4) + "\n" final = final + " adcl " + cyhi + ", %eax\n" final = final + " adcl $0, %edx\n" final = final + " addl %eax, " + offaddr(dst, off_dst+4*(k-1)) + "\n" final = final + " adcl $0, %edx\n" final = final + "### carry limb is in %edx\n" code = code + final return code, "%edx" ### Try mmx/sse2 addmul_1, copying the one of GMP for Pentium4 def addmul1_k_var(src, off_src, dst, off_dst, mult, k): init = "### addmul1: src[0] is " + offaddr(src, off_src) + "\n" init = init + "### dst[0] is " + offaddr(dst, off_dst) + "\n" init = init + "### mult is " + mult + "\n" init = init + "### k is " + str(k) + "\n" init = init + "### kills %eax, %edx and mmx regs \n" init = init + "### dst[0,k[ += mult*src[0,k[ plus carry put in ecx\n" init = init + " pxor %mm0, %mm0\n" init = init 
+ " movd " + mult + ", %mm7\n" block = """ movd __xi__, %mm1 movd __zi__, %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, __zi__ psrlq $32, %mm0 """ code = init for i in range(0,k): blocki = re.sub('__xi__', offaddr(src, off_src+i*4), block) blocki = re.sub('__zi__', offaddr(dst, off_dst+i*4), blocki) code = code + blocki final = " movd %mm0, %ecx\n" final = final + "### carry limb is in %ecx\n" code = code + final return code, "%ecx" def mulredc_k_rolled(k): header = """# mp_limb_t mulredc__k(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc__k TYPE(GSYM_PREFIX`'mulredc__k,`function') GSYM_PREFIX`'mulredc__k: """ init = re.sub("__k", str(k), header) INV_M = offaddr("%esp", 4*(2*k+1) + 40) ADDR_M = offaddr("%esp", 4*(2*k+1) + 36) ADDR_Y = offaddr("%esp", 4*(2*k+1) + 32) ADDR_X = offaddr("%esp", 4*(2*k+1) + 28) ADDR_Z = offaddr("%esp", 4*(2*k+1) + 24) init = init + """ pushl %ebp pushl %edi pushl %esi pushl %ebx """ init = init + " subl $" + str(4*(2*k+2)) + ", %esp\n" init = init + " movl %esp, %edi\n" init = init + "### set tmp[0..2k+1[ to 0\n" for i in range(0,2*k+1): init = init + " movl $0, " + offaddr("%edi", 4*i) + "\n" code = init middle_code = "###########################################\n" middle_code = middle_code + " movl $" + str(k) + ", " + offaddr("%esp", 4*(2*k+1)) + "\n" middle_code = middle_code + """ .align 32 Loop: ## compute u and store in %ebp """ middle_code = middle_code + " movl " + ADDR_X + ", %eax\n" middle_code = middle_code + " movl " + ADDR_Y + ", %esi\n" middle_code = middle_code + """ movl (%eax), %eax mull (%esi) addl (%edi), %eax """ middle_code = middle_code + " mull " + INV_M + "\n" 
middle_code = middle_code + " movl %eax, %ebp\n" middle_code = middle_code + " movl " + ADDR_M + ", %esi\n" codeaddmul, carry = addmul1_k("%esi", 0, "%edi", 0, "%ebp", k) middle_code = middle_code + codeaddmul middle_code = middle_code + " addl " + carry + ", " + offaddr("%edi", 4*k) + "\n" middle_code = middle_code + " adcl $0, " + offaddr("%edi", 4*(k+1)) + "\n" middle_code = middle_code + " movl " + ADDR_X + ", %eax\n" middle_code = middle_code + " movl (%eax), %ebp\n" middle_code = middle_code + " movl " + ADDR_Y + ", %esi\n" codeaddmul, carry = addmul1_k("%esi", 0, "%edi", 0, "%ebp", k) middle_code = middle_code + codeaddmul middle_code = middle_code + " addl " + carry + ", " + offaddr("%edi", 4*k) + "\n" middle_code = middle_code + " adcl $0, " + offaddr("%edi", 4*(k+1)) + "\n\n" middle_code = middle_code + " addl $4, " + ADDR_X + "\n addl $4, %edi\n" middle_code = middle_code + " decl " + offaddr("%esp", 4*(2*k+1)) + "\n jnz Loop\n" code = code + middle_code final = "###########################################\n" final = final + "### Copy result in z\n" final = final + " movl " + ADDR_Z + ", %ebx\n" for i in range(0,k): final = final + " movl " + offaddr("%edi", 4*i) + ", %eax\n" final = final + " movl %eax, " + offaddr("%ebx", 4*i) + "\n" final = final + " movl " + offaddr("%edi", 4*k) + ", %eax # carry\n" final = final + " addl $" + str(4*(2*k+2)) + ", %esp\n" final = final + " popl %ebx\n" final = final + " popl %esi\n" final = final + " popl %edi\n" final = final + " popl %ebp\n" # final = final + " emms\n" final = final + " ret\n" code = code + final return code k = int(sys.argv[1]) if k == 1: print """# # mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y, # const mp_limb_t m, mp_limb_t inv_m) # # Compute z := x*y mod m, in Montgomery representation, where x, y < m # and m is n limb wide. 
inv_m is the less significant limb of the # inverse of m modulo 2^(n*GMP_LIMB_BITS) # # The result might be unreduced (larger than m) but becomes reduced # after subtracting m. The calling function should take care of that. # # We use a temporary space for unreduced product on the stack. # Therefore, this can not be used for large integers (anyway, the # algorithm is quadratic). # # WARNING: z is only n limbs but since it might be unreduced, there # could be a carry that does not fit in z. This carry is returned. include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc1 TYPE(GSYM_PREFIX`'mulredc1,`function') GSYM_PREFIX`'mulredc1: # Stack: # inv_m 20(%esp) # m 16 # y 12(%esp) # x 8 # z 4(%esp) movl 12(%esp), %eax mull 8(%esp) movl %edx, 12(%esp) movl %eax, 8(%esp) # store xy in [8(%esp):12(%esp)] mull 20(%esp) # compute u mull 16(%esp) # compute u*m addl 8(%esp), %eax # eax is 0, now (carry is important) adcl 12(%esp), %edx movl 4(%esp), %ecx movl %edx, (%ecx) adcl $0, %eax ret """ else: print mulredc_k_rolled(k) ecm-6.4.4/athlon/mulredc3.asm0000644023561000001540000000507212106741265012730 00000000000000# mp_limb_t mulredc3(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc3 TYPE(GSYM_PREFIX`'mulredc3,`function') GSYM_PREFIX`'mulredc3: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $32, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) ########################################### movl $3, 28(%esp) .align 32 Loop: ## compute u and store in %ebp movl 56(%esp), %eax movl 60(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 68(%esp) movl %eax, %ebp movl 64(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 3 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 8(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 12(%edi) adcl $0, 16(%edi) movl 56(%esp), %eax movl (%eax), %ebp movl 60(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 3 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 8(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 12(%edi) adcl $0, 16(%edi) addl $4, 56(%esp) addl $4, %edi decl 28(%esp) jnz Loop ########################################### ### Copy result in z movl 52(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax # carry addl $32, %esp popl %ebx popl %esi popl %edi 
popl %ebp ret ecm-6.4.4/athlon/mulredc.h0000644023561000001540000000462512106741265012317 00000000000000#ifndef __ASM_REDC_H__ #define __ASM_REDC_H__ #include /* Signals that we have assembly code for variable size redc */ #define HAVE_ASM_REDC3 extern void ecm_redc3(mp_limb_t *, const mp_limb_t *, mp_size_t, mp_limb_t); /* WARNING: the size-1 version doesn't take pointers in input */ extern mp_limb_t mulredc1(mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); extern mp_limb_t mulredc2(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc3(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc4(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc5(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc6(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc7(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc8(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc9(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc10(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc11(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc12(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc13(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc14(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc15(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern 
mp_limb_t mulredc16(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc17(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc18(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc19(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc20(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); #endif ecm-6.4.4/athlon/mulredc14.asm0000644023561000001540000001323312106741265013010 00000000000000# mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc14 TYPE(GSYM_PREFIX`'mulredc14,`function') GSYM_PREFIX`'mulredc14: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $120, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) ########################################### movl $14, 116(%esp) .align 32 Loop: ## compute u and store in %ebp movl 144(%esp), %eax movl 148(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 156(%esp) movl %eax, %ebp movl 152(%esp), %esi ### addmul1: src[0] 
is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 14 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 52(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 56(%edi) adcl $0, 60(%edi) movl 144(%esp), %eax movl (%eax), %ebp movl 148(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 14 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx 
movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 52(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 56(%edi) adcl $0, 60(%edi) addl $4, 144(%esp) addl $4, %edi decl 116(%esp) jnz Loop ########################################### ### Copy result in z movl 140(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax # carry addl $120, %esp popl %ebx popl %esi popl %edi popl %ebp ret 
ecm-6.4.4/athlon/mulredc5.asm0000644023561000001540000000616612106741265012737 00000000000000# mp_limb_t mulredc5(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc5 TYPE(GSYM_PREFIX`'mulredc5,`function') GSYM_PREFIX`'mulredc5: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $48, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) ########################################### movl $5, 44(%esp) .align 32 Loop: ## compute u and store in %ebp movl 72(%esp), %eax movl 76(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 84(%esp) movl %eax, %ebp movl 80(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 5 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 16(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 20(%edi) adcl $0, 24(%edi) movl 72(%esp), %eax movl (%eax), %ebp movl 76(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 5 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put 
in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 16(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 20(%edi) adcl $0, 24(%edi) addl $4, 72(%esp) addl $4, %edi decl 44(%esp) jnz Loop ########################################### ### Copy result in z movl 68(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax # carry addl $48, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/Makefile.dev0000644023561000001540000000160412106741265012722 00000000000000.PHONY: all all: test_mulredc bench CFLAGS:=-g -O2 -funroll-loops ALLMULRED:= mulredc1.o mulredc2.o mulredc3.o mulredc4.o mulredc5.o\ mulredc6.o mulredc7.o mulredc8.o mulredc9.o mulredc10.o\ mulredc11.o mulredc12.o mulredc13.o mulredc14.o\ mulredc15.o mulredc16.o mulredc17.o mulredc18.o\ mulredc19.o mulredc20.o redc.s: redc.asm m4 redc.asm > redc.s redc.o: redc.s gcc -c $(CFLAGS) redc.s -o redc.o mulredc%.o: mulredc%.asm m4 $< > tmp-mulred.s gcc -c $(CFLAGS) tmp-mulred.s -o $@ rm tmp-mulred.s mulredc%.asm: ./autogen.py ./autogen.py $* > $@ test_mulredc: test_mulredc.c redc.o $(ALLMULRED) gcc -o test_mulredc $(CFLAGS) test_mulredc.c $(ALLMULRED) redc.o -lgmp bench: bench.c redc.o $(ALLMULRED) gcc -o bench $(CFLAGS) bench.c $(ALLMULRED) redc.o -lgmp clean: rm redc.s *.o mulredc[0-9]*.s mulredc[0-9]*.asm test_mulredc ecm-6.4.4/athlon/generate_all0000755023561000001540000000016312106741265013054 00000000000000#!/bin/sh for i in 1 2 3 4 5 6 7 8 
9 10 11 12 13 14 15 16 17 18 19 20; do ./autogen.py $i > mulredc$i.asm done ecm-6.4.4/athlon/mulredc6.asm0000644023561000001540000000662412106741265012737 00000000000000# mp_limb_t mulredc6(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc6 TYPE(GSYM_PREFIX`'mulredc6,`function') GSYM_PREFIX`'mulredc6: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $56, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) ########################################### movl $6, 52(%esp) .align 32 Loop: ## compute u and store in %ebp movl 80(%esp), %eax movl 84(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 92(%esp) movl %eax, %ebp movl 88(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 6 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 20(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 24(%edi) adcl $0, 28(%edi) movl 
80(%esp), %eax movl (%eax), %ebp movl 84(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 6 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 20(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 24(%edi) adcl $0, 28(%edi) addl $4, 80(%esp) addl $4, %edi decl 52(%esp) jnz Loop ########################################### ### Copy result in z movl 76(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax # carry addl $56, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc2.asm0000644023561000001540000000443712106741265012733 00000000000000# mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc2 TYPE(GSYM_PREFIX`'mulredc2,`function') GSYM_PREFIX`'mulredc2: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $24, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) ########################################### movl $2, 20(%esp) .align 32 Loop: ## compute u and store in %ebp movl 48(%esp), %eax movl 52(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 60(%esp) movl %eax, %ebp movl 56(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 2 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 4(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 8(%edi) adcl $0, 12(%edi) movl 48(%esp), %eax movl (%eax), %ebp movl 52(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 2 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 4(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 8(%edi) adcl $0, 12(%edi) addl $4, 48(%esp) addl $4, %edi decl 20(%esp) jnz Loop ########################################### ### Copy result in z movl 44(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax # carry addl $24, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc20.asm0000644023561000001540000001653312106741265013013 00000000000000# mp_limb_t mulredc20(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # 
y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc20 TYPE(GSYM_PREFIX`'mulredc20,`function') GSYM_PREFIX`'mulredc20: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $168, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) movl $0, 124(%edi) movl $0, 128(%edi) movl $0, 132(%edi) movl $0, 136(%edi) movl $0, 140(%edi) movl $0, 144(%edi) movl $0, 148(%edi) movl $0, 152(%edi) movl $0, 156(%edi) movl $0, 160(%edi) ########################################### movl $20, 164(%esp) .align 32 Loop: ## compute u and store in %ebp movl 192(%esp), %eax movl 196(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 204(%esp) movl %eax, %ebp movl 200(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 20 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, 
%ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) movl $0, %ebx adcl %eax, %ecx movl 56(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 52(%edi) movl $0, %ecx adcl %eax, %ebx movl 60(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 56(%edi) movl $0, %ebx adcl %eax, %ecx movl 64(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 60(%edi) movl $0, %ecx adcl %eax, %ebx movl 68(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 64(%edi) movl $0, %ebx adcl %eax, %ecx movl 72(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 68(%edi) movl $0, %ecx adcl %eax, %ebx movl 76(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 72(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 76(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 80(%edi) adcl $0, 84(%edi) movl 192(%esp), %eax movl (%eax), %ebp movl 196(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 20 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl 
%eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) movl $0, %ebx adcl %eax, %ecx movl 56(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 52(%edi) movl $0, %ecx adcl %eax, %ebx movl 60(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 56(%edi) movl $0, %ebx adcl %eax, %ecx movl 64(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 60(%edi) movl $0, %ecx adcl %eax, %ebx movl 68(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 64(%edi) movl $0, %ebx adcl %eax, %ecx movl 72(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 68(%edi) movl $0, %ecx adcl %eax, %ebx movl 76(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 72(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 76(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 80(%edi) adcl $0, 84(%edi) addl $4, 192(%esp) addl $4, %edi decl 164(%esp) jnz Loop ########################################### ### Copy result in z movl 188(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 
12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax movl %eax, 60(%ebx) movl 64(%edi), %eax movl %eax, 64(%ebx) movl 68(%edi), %eax movl %eax, 68(%ebx) movl 72(%edi), %eax movl %eax, 72(%ebx) movl 76(%edi), %eax movl %eax, 76(%ebx) movl 80(%edi), %eax # carry addl $168, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc17.asm0000644023561000001540000001477312106741265013025 00000000000000# mp_limb_t mulredc17(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t)
#
# Review notes for mulredc17 (k = 17 limbs of 32 bits, x86 AT&T syntax).
# Stack slots used below, after the four pushes and the 144-byte frame:
#   164(%esp) z, 168(%esp) x, 172(%esp) y, 176(%esp) m, 180(%esp) inv_m
#   (names taken from the prototype and stack sketch in the file header).
# Frame: tmp[0..34] at 0..136(%esp), loop counter at 140(%esp).
# For i = 0..16 the loop does
#     u = (x[i]*y[0] + tmp[i]) * inv_m   mod 2^32
#     tmp[i..i+18] += u    * m[0..16]
#     tmp[i..i+18] += x[i] * y[0..16]
# then tmp[17..33] is copied to z and tmp[34] is returned in %eax.
# No comparison with m and no subtraction of m happens in this routine.
# NOTE(review): this has the shape of word-by-word Montgomery
# multiplication, presumably with inv_m = -1/m[0] mod 2^32; confirm
# against the callers.
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc17
	TYPE(GSYM_PREFIX`'mulredc17,`function')

GSYM_PREFIX`'mulredc17:
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $144, %esp		# 35 tmp limbs + 1 counter limb
	movl %esp, %edi		# %edi = &tmp[0]; advances one limb per iteration
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
	movl $0, 108(%edi)
	movl $0, 112(%edi)
	movl $0, 116(%edi)
	movl $0, 120(%edi)
	movl $0, 124(%edi)
	movl $0, 128(%edi)
	movl $0, 132(%edi)
	movl $0, 136(%edi)
###########################################
	movl $17, 140(%esp)	# outer loop counter: one pass per limb of x

	.align 32
Loop:
	## compute u and store in %ebp
	movl 168(%esp), %eax	# current x pointer (its slot is advanced below)
	movl 172(%esp), %esi	# y
	movl (%eax), %eax	# x[i]
	mull (%esi)		# %edx:%eax = x[i] * y[0]
	addl (%edi), %eax	# low limb + tmp[i]; the carry out is ignored
	mull 180(%esp)		# times inv_m
	movl %eax, %ebp		# u = low limb of that product
	movl 176(%esp), %esi	# src = m for the first addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 17
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
# %ebx and %ecx alternate as the pending low word / running high word.
# The movl instructions placed between addl/adcl do not modify flags, so
# the carry chain addl -> adcl %eax -> adcl %edx stays intact; this is why
# the registers are cleared with movl $0 here.
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 52(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 48(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 56(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 52(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 60(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 56(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 64(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 60(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 64(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 68(%edi)	# add the carry limb into dst[k]
	adcl $0, 72(%edi)	# and propagate the carry bit into dst[k+1]
	movl 168(%esp), %eax
	movl (%eax), %ebp	# mult = x[i]
	movl 172(%esp), %esi	# src = y for the second addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 17
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 52(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 48(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 56(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 52(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 60(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 56(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 64(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 60(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 64(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 68(%edi)
	adcl $0, 72(%edi)

	addl $4, 168(%esp)	# advance the x pointer stored in its argument slot
	addl $4, %edi		# slide the tmp window up by one limb
	decl 140(%esp)
	jnz Loop
###########################################
### Copy result in z
# Here %edi = &tmp[17]; limbs tmp[17..33] go to z[0..16].
	movl 164(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax
	movl %eax, 52(%ebx)
	movl 56(%edi), %eax
	movl %eax, 56(%ebx)
	movl 60(%edi), %eax
	movl %eax, 60(%ebx)
	movl 64(%edi), %eax
	movl %eax, 64(%ebx)
	movl 68(%edi), %eax	# carry
	addl $144, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
# NOTE(review): the next line starts with what appears to be a tar member
# header (path, mode, owner, size, mtime); it is kept verbatim.
ecm-6.4.4/athlon/redc.asm0000644023561000001540000001600512106741265012125 00000000000000dnl Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
dnl
dnl This file is a modified part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.

#
# Review notes for ecm_redc3 (32-bit x86, AT&T syntax).
# Stack slots read below, after the four pushes and the 16-byte frame
# (names follow the inline comments): 36(%esp) dest pointer, 40(%esp)
# source pointer, 44(%esp) size in limbs, 48(%esp) invm.
# The outer loop runs size times; each pass does
#     q = dest[0] * invm   mod 2^32
#     dest[0..size-1] += q * source[0..size-1]
#     dest[0] = carry limb left over from that addmul
#     dest advances by one limb
# Sizes below 5 use the plain InnerLoop. Sizes of 5 and more use a 16-way
# unrolled addmul that is entered through a computed jump.
# Local frame: (%esp) outer counter, 4(%esp) unrolled-loop counter,
# 8(%esp) reload value for 4(%esp), 12(%esp) computed entry address.
# The dest and size argument slots (36 and 44) are overwritten as scratch.
# NOTE(review): this looks like the addmul phase of a Montgomery REDC that
# leaves the per-pass carries in the low limbs for the caller to add;
# confirm against the C callers.
# NOTE(review): size == 0 is not guarded (the counters would wrap around);
# callers presumably never pass it.
#
include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'ecm_redc3
	TYPE(GSYM_PREFIX`'ecm_redc3,`function')

GSYM_PREFIX`'ecm_redc3:
	push %ebp			# Push registers
	push %edi
	push %esi
	push %ebx
	subl $16, %esp			# Stack frame: counters + jump target (4 limbs)

	movl 44(%esp), %ecx		# Read size
	movl 36(%esp), %edi		# Read Dest Ptr
	movl %ecx, (%esp)		# Save counter
	cmpl $5, %ecx
	jae Unroll			# unsigned compare: size >= 5 takes the unrolled path

Loop:
	movl 48(%esp), %ebp		# Read invm
	movl 40(%esp), %esi		# Read Source Ptr
	imull (%edi), %ebp		# Dest[0] * invm
	movl %edi, 36(%esp)		# Save new Dest
	movl 44(%esp), %ecx		# Read Size (2)
	xorl %ebx, %ebx			# Initial Carry

InnerLoop:
	# esi: Source
	# edi: Dest
	# ebp: Multiplier
	# ecx: Counter
	movl (%esi), %eax		# U1
	addl $4, %edi			# V1
	mull %ebp			# U2
	addl $4, %esi			# V2
	addl %ebx, %eax			# U3
	adcl $0, %edx			# U4
	addl %eax, -4(%edi)		# V4
	adcl $0, %edx			# U5
	decl %ecx			# V5
	movl %edx, %ebx			# U6 (movl keeps the flags set by decl)
	jnz InnerLoop			# V6

	movl 36(%esp), %edi		# reload the dest pointer saved at the top of the pass
	movl %ebx, (%edi)		# Save final carry
	decl (%esp)
	lea 4(%edi), %edi		# Advance Dest (lea leaves the flags of decl untouched)
	jnz Loop			# Loop
End:
	addl $16, %esp
	pop %ebx
	pop %esi
	pop %edi
	pop %ebp
	ret

Unroll:
# %ecx Read size // %edi Dest Ptr
	# Precompute the jump target
	# c = (1 - size) & 15 is the number of unrolled steps skipped on entry;
	# (size - 2) >> 4 is the start value of the unrolled-loop counter.
	movl %ecx, %edx
	decl %ecx
	subl $2, %edx
	negl %ecx
	shrl $4, %edx
	andl $15, %ecx
	movl %edx, 8(%esp)		# Original counter value, reloaded into 4(%esp)
	movl %ecx, %edx
	shll $4, %edx
	negl %ecx
	# entry = UnrollEntry + 16*c - c = UnrollEntry + 15*c
	# NOTE(review): this presumably relies on every unrolled step
	# assembling to exactly 15 bytes; see the .byte encodings below.
	leal UnrollEntry (%edx, %ecx,1), %edx
	movl %ecx, 44(%esp)		# (-size)%16: stores -c, overwriting the size argument
	movl %edx, 12(%esp)		# Original PC (entry address) inside the unrolled loop

UnrollLoop:
	movl 48(%esp), %ebp		# Read invm
	movl 40(%esp), %esi		# Read Source Ptr
	imull (%edi), %ebp		# Dest[0] * invm
	movl %edi, 36(%esp)		# Save new Dest
	movl 44(%esp), %ecx		# Read Size %16 (the -c stored above)
	movl 8(%esp), %edx		# Read unrolled-loop counter start value
	movl %edx, 4(%esp)		# Set unrolled-loop counter

	# First mull and set initial carry
	movl (%esi), %eax
	leal 4(%esi,%ecx,4), %esi	# bias src by 1 - c limbs to match the entry point
	mull %ebp
	leal (%edi,%ecx,4), %edi	# bias dst by -c limbs
	movl %edx, %ebx

	# Do the Jump inside the unrolling loop
	# And set up the registers differently if odd
	movl 12(%esp), %edx
	testl $1, %ecx
	movl %eax, %ecx			# movl keeps the flags from testl
	cmovnz %ebx, %ecx		# odd c: swap the roles of %ebx and %ecx
	cmovnz %eax, %ebx
	jmp *%edx

	# eax scratch
	# ebx carry hi
	# ecx carry lo
	# edx scratch
	# esi src
	# edi dst
	# ebp multiplier

	.align 32, 0x90
UnrollInnerLoop:
	addl $64, %edi
UnrollEntry:
#	movl 0(%esi), %eax		# Can't use this instruction
	# hand-encoded form of the movl above with an explicit zero 8-bit
	# displacement (presumably to keep this step the same size as the others)
	.byte 0x8b,0x46,0x00
	mull %ebp
#	addl %ecx, 0(%edi)		# Can't use this instruction
	# hand-encoded form of the addl above, same reason
	.byte 0x01,0x4f,0x00
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, 4(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 8(%esi), %eax
	mull %ebp
	addl %ecx, 8(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 12(%esi), %eax
	mull %ebp
	addl %ebx, 12(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 16(%esi), %eax
	mull %ebp
	addl %ecx, 16(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 20(%esi), %eax
	mull %ebp
	addl %ebx, 20(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 24(%esi), %eax
	mull %ebp
	addl %ecx, 24(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 28(%esi), %eax
	mull %ebp
	addl %ebx, 28(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 32(%esi), %eax
	mull %ebp
	addl %ecx, 32(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 36(%esi), %eax
	mull %ebp
	addl %ebx, 36(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 40(%esi), %eax
	mull %ebp
	addl %ecx, 40(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 44(%esi), %eax
	mull %ebp
	addl %ebx, 44(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 48(%esi), %eax
	mull %ebp
	addl %ecx, 48(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 52(%esi), %eax
	mull %ebp
	addl %ebx, 52(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	movl 56(%esi), %eax
	mull %ebp
	addl %ecx, 56(%edi)
	adcl %eax, %ebx
	movl %edx, %ecx
	adcl $0, %ecx

	movl 60(%esi), %eax
	mull %ebp
	addl %ebx, 60(%edi)
	adcl %eax, %ecx
	movl %edx, %ebx
	adcl $0, %ebx

	decl 4(%esp)
	leal 64(%esi), %esi		# leal keeps the sign flag of decl for jns
	jns UnrollInnerLoop

	addl %ecx, 64(%edi)		# add the pending low word into the last limb
	movl 36(%esp), %edi		# reload the dest pointer saved at the top of the pass
	adcl $0, %ebx			# movl above kept the carry of the addl
	movl %ebx, (%edi)		# Save final carry
	decl (%esp)
	lea 4(%edi), %edi		# Advance Dest (lea leaves the flags of decl untouched)
	jnz UnrollLoop			# Loop
End2:
	addl $16, %esp
	pop %ebx
	pop %esi
	pop %edi
	pop %ebp
	ret
# NOTE(review): the next line starts with what appears to be a tar member
# header (path, mode, owner, size, mtime); it is kept verbatim.
ecm-6.4.4/athlon/mulredc18.asm0000644023561000001540000001543312106741265013020 00000000000000# mp_limb_t mulredc18(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
#  Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z							(4*(2k+7))%esp
#    ???   (1 limb???)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...
## tmp space (2*k+1 mp_limb_t)
#
# Review notes for mulredc18 (k = 18 limbs of 32 bits, x86 AT&T syntax).
# Stack slots used below, after the four pushes and the 152-byte frame:
#   172(%esp) z, 176(%esp) x, 180(%esp) y, 184(%esp) m, 188(%esp) inv_m
#   (names taken from the prototype and stack sketch in the file header).
# Frame: tmp[0..36] at 0..144(%esp), loop counter at 148(%esp).
# For i = 0..17 the loop does
#     u = (x[i]*y[0] + tmp[i]) * inv_m   mod 2^32
#     tmp[i..i+19] += u    * m[0..17]
#     tmp[i..i+19] += x[i] * y[0..17]
# then tmp[18..35] is copied to z and tmp[36] is returned in %eax.
# No comparison with m and no subtraction of m happens in this routine.
# NOTE(review): this has the shape of word-by-word Montgomery
# multiplication, presumably with inv_m = -1/m[0] mod 2^32; confirm
# against the callers.
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc18
	TYPE(GSYM_PREFIX`'mulredc18,`function')

GSYM_PREFIX`'mulredc18:
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $152, %esp		# 37 tmp limbs + 1 counter limb
	movl %esp, %edi		# %edi = &tmp[0]; advances one limb per iteration
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
	movl $0, 108(%edi)
	movl $0, 112(%edi)
	movl $0, 116(%edi)
	movl $0, 120(%edi)
	movl $0, 124(%edi)
	movl $0, 128(%edi)
	movl $0, 132(%edi)
	movl $0, 136(%edi)
	movl $0, 140(%edi)
	movl $0, 144(%edi)
###########################################
	movl $18, 148(%esp)	# outer loop counter: one pass per limb of x

	.align 32
Loop:
	## compute u and store in %ebp
	movl 176(%esp), %eax	# current x pointer (its slot is advanced below)
	movl 180(%esp), %esi	# y
	movl (%eax), %eax	# x[i]
	mull (%esi)		# %edx:%eax = x[i] * y[0]
	addl (%edi), %eax	# low limb + tmp[i]; the carry out is ignored
	mull 188(%esp)		# times inv_m
	movl %eax, %ebp		# u = low limb of that product
	movl 184(%esp), %esi	# src = m for the first addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 18
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
# %ebx and %ecx alternate as the pending low word / running high word.
# The movl instructions placed between addl/adcl do not modify flags, so
# the carry chain addl -> adcl %eax -> adcl %edx stays intact.
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 52(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 48(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 56(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 52(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 60(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 56(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 64(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 60(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 68(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 64(%edi)
	adcl %ecx, %eax
	adcl $0, %edx
	addl %eax, 68(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 72(%edi)	# add the carry limb into dst[k]
	adcl $0, 76(%edi)	# and propagate the carry bit into dst[k+1]
	movl 176(%esp), %eax
	movl (%eax), %ebp	# mult = x[i]
	movl 180(%esp), %esi	# src = y for the second addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 18
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 52(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 48(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 56(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 52(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 60(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 56(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 64(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 60(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 68(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 64(%edi)
	adcl %ecx, %eax
	adcl $0, %edx
	addl %eax, 68(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 72(%edi)
	adcl $0, 76(%edi)

	addl $4, 176(%esp)	# advance the x pointer stored in its argument slot
	addl $4, %edi		# slide the tmp window up by one limb
	decl 148(%esp)
	jnz Loop
###########################################
### Copy result in z
# Here %edi = &tmp[18]; limbs tmp[18..35] go to z[0..17].
	movl 172(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax
	movl %eax, 52(%ebx)
	movl 56(%edi), %eax
	movl %eax, 56(%ebx)
	movl 60(%edi), %eax
	movl %eax, 60(%ebx)
	movl 64(%edi), %eax
	movl %eax, 64(%ebx)
	movl 68(%edi), %eax
	movl %eax, 68(%ebx)
	movl 72(%edi), %eax	# carry
	addl $152, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
# NOTE(review): the next line starts with what appears to be a tar member
# header (path, mode, owner, size, mtime); it is kept verbatim.
ecm-6.4.4/athlon/mulredc19.asm0000644023561000001540000001607312106741265013022 00000000000000# mp_limb_t mulredc19(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
#  Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z							(4*(2k+7))%esp
#    ???   (1 limb???)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...      ## tmp space   (2*k+1 mp_limb_t)
#
# Review notes for mulredc19 (k = 19 limbs of 32 bits, x86 AT&T syntax).
# Stack slots used below, after the four pushes and the 160-byte frame:
#   180(%esp) z, 184(%esp) x, 188(%esp) y, 192(%esp) m, 196(%esp) inv_m
#   (names taken from the prototype and stack sketch just above).
# Frame: tmp[0..38] at 0..152(%esp), loop counter at 156(%esp).
# For i = 0..18 the loop does
#     u = (x[i]*y[0] + tmp[i]) * inv_m   mod 2^32
#     tmp[i..i+20] += u    * m[0..18]
#     tmp[i..i+20] += x[i] * y[0..18]
# then tmp[19..37] is copied to z and tmp[38] is returned in %eax.
# No comparison with m and no subtraction of m happens in this routine.
# NOTE(review): this has the shape of word-by-word Montgomery
# multiplication, presumably with inv_m = -1/m[0] mod 2^32; confirm
# against the callers.
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc19
	TYPE(GSYM_PREFIX`'mulredc19,`function')

GSYM_PREFIX`'mulredc19:
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $160, %esp		# 39 tmp limbs + 1 counter limb
	movl %esp, %edi		# %edi = &tmp[0]; advances one limb per iteration
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
	movl $0, 108(%edi)
	movl $0, 112(%edi)
	movl $0, 116(%edi)
	movl $0, 120(%edi)
	movl $0, 124(%edi)
	movl $0, 128(%edi)
	movl $0, 132(%edi)
	movl $0, 136(%edi)
	movl $0, 140(%edi)
	movl $0, 144(%edi)
	movl $0, 148(%edi)
	movl $0, 152(%edi)
###########################################
	movl $19, 156(%esp)	# outer loop counter: one pass per limb of x

	.align 32
Loop:
	## compute u and store in %ebp
	movl 184(%esp), %eax	# current x pointer (its slot is advanced below)
	movl 188(%esp), %esi	# y
	movl (%eax), %eax	# x[i]
	mull (%esi)		# %edx:%eax = x[i] * y[0]
	addl (%edi), %eax	# low limb + tmp[i]; the carry out is ignored
	mull 196(%esp)		# times inv_m
	movl %eax, %ebp		# u = low limb of that product
	movl 192(%esp), %esi	# src = m for the first addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 19
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
# %ebx and %ecx alternate as the pending low word / running high word.
# The movl instructions placed between addl/adcl do not modify flags, so
# the carry chain addl -> adcl %eax -> adcl %edx stays intact.
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 52(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 48(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 56(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 52(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 60(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 56(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 64(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 60(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 68(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 64(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 72(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 68(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 72(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 76(%edi)	# add the carry limb into dst[k]
	adcl $0, 80(%edi)	# and propagate the carry bit into dst[k+1]
	movl 184(%esp), %eax
	movl (%eax), %ebp	# mult = x[i]
	movl 188(%esp), %esi	# src = y for the second addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 19
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 52(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 48(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 56(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 52(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 60(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 56(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 64(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 60(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 68(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 64(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 72(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 68(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 72(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 76(%edi)
	adcl $0, 80(%edi)

	addl $4, 184(%esp)	# advance the x pointer stored in its argument slot
	addl $4, %edi		# slide the tmp window up by one limb
	decl 156(%esp)
	jnz Loop
###########################################
### Copy result in z
# Here %edi = &tmp[19]; limbs tmp[19..37] go to z[0..18].
	movl 180(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax
	movl %eax, 52(%ebx)
	movl 56(%edi), %eax
	movl %eax, 56(%ebx)
	movl 60(%edi), %eax
	movl %eax, 60(%ebx)
	movl 64(%edi), %eax
	movl %eax, 64(%ebx)
	movl 68(%edi), %eax
	movl %eax, 68(%ebx)
	movl 72(%edi), %eax
	movl %eax, 72(%ebx)
	movl 76(%edi), %eax	# carry
	addl $160, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
# NOTE(review): the next line starts with what appears to be a tar member
# header (path, mode, owner, size, mtime); it is kept verbatim.
ecm-6.4.4/athlon/mulredc9.asm0000644023561000001540000001036612106741265012740 00000000000000# mp_limb_t mulredc9(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
#  Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z							(4*(2k+7))%esp
#    ???   (1 limb???)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...
## tmp space (2*k+1 mp_limb_t)
#
# Review notes for mulredc9 (k = 9 limbs of 32 bits, x86 AT&T syntax).
# Stack slots used below, after the four pushes and the 80-byte frame:
#   100(%esp) z, 104(%esp) x, 108(%esp) y, 112(%esp) m, 116(%esp) inv_m
#   (names taken from the prototype and stack sketch in the file header).
# Frame: tmp[0..18] at 0..72(%esp), loop counter at 76(%esp).
# For i = 0..8 the loop does
#     u = (x[i]*y[0] + tmp[i]) * inv_m   mod 2^32
#     tmp[i..i+10] += u    * m[0..8]
#     tmp[i..i+10] += x[i] * y[0..8]
# then tmp[9..17] is copied to z and tmp[18] is returned in %eax.
# No comparison with m and no subtraction of m happens in this routine.
# NOTE(review): this has the shape of word-by-word Montgomery
# multiplication, presumably with inv_m = -1/m[0] mod 2^32; confirm
# against the callers.
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc9
	TYPE(GSYM_PREFIX`'mulredc9,`function')

GSYM_PREFIX`'mulredc9:
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $80, %esp		# 19 tmp limbs + 1 counter limb
	movl %esp, %edi		# %edi = &tmp[0]; advances one limb per iteration
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
###########################################
	movl $9, 76(%esp)	# outer loop counter: one pass per limb of x

	.align 32
Loop:
	## compute u and store in %ebp
	movl 104(%esp), %eax	# current x pointer (its slot is advanced below)
	movl 108(%esp), %esi	# y
	movl (%eax), %eax	# x[i]
	mull (%esi)		# %edx:%eax = x[i] * y[0]
	addl (%edi), %eax	# low limb + tmp[i]; the carry out is ignored
	mull 116(%esp)		# times inv_m
	movl %eax, %ebp		# u = low limb of that product
	movl 112(%esp), %esi	# src = m for the first addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 9
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
# %ebx and %ecx alternate as the pending low word / running high word.
# The movl instructions placed between addl/adcl do not modify flags, so
# the carry chain addl -> adcl %eax -> adcl %edx stays intact.
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 32(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 36(%edi)	# add the carry limb into dst[k]
	adcl $0, 40(%edi)	# and propagate the carry bit into dst[k+1]
	movl 104(%esp), %eax
	movl (%eax), %ebp	# mult = x[i]
	movl 108(%esp), %esi	# src = y for the second addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 9
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 32(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 36(%edi)
	adcl $0, 40(%edi)

	addl $4, 104(%esp)	# advance the x pointer stored in its argument slot
	addl $4, %edi		# slide the tmp window up by one limb
	decl 76(%esp)
	jnz Loop
###########################################
### Copy result in z
# Here %edi = &tmp[9]; limbs tmp[9..17] go to z[0..8].
	movl 100(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax	# carry
	addl $80, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
# NOTE(review): the next line starts with what appears to be a tar member
# header (path, mode, owner, size, mtime); it is kept verbatim.
ecm-6.4.4/athlon/mulredc13.asm0000644023561000001540000001257312106741265013015 00000000000000# mp_limb_t mulredc13(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
#  Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z							(4*(2k+7))%esp
#    ???   (1 limb???)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...
## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...      ## tmp space   (2*k+1 mp_limb_t)
#
# Review notes for mulredc13 (k = 13 limbs of 32 bits, x86 AT&T syntax).
# Stack slots used below, after the four pushes and the 112-byte frame:
#   132(%esp) z, 136(%esp) x, 140(%esp) y, 144(%esp) m, 148(%esp) inv_m
#   (names taken from the prototype and stack sketch in the file header).
# Frame: tmp[0..26] at 0..104(%esp), loop counter at 108(%esp).
# For i = 0..12 the loop does
#     u = (x[i]*y[0] + tmp[i]) * inv_m   mod 2^32
#     tmp[i..i+14] += u    * m[0..12]
#     tmp[i..i+14] += x[i] * y[0..12]
# then tmp[13..25] is copied to z and tmp[26] is returned in %eax.
# No comparison with m and no subtraction of m happens in this routine.
# NOTE(review): this has the shape of word-by-word Montgomery
# multiplication, presumably with inv_m = -1/m[0] mod 2^32; confirm
# against the callers.
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc13
	TYPE(GSYM_PREFIX`'mulredc13,`function')

GSYM_PREFIX`'mulredc13:
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $112, %esp		# 27 tmp limbs + 1 counter limb
	movl %esp, %edi		# %edi = &tmp[0]; advances one limb per iteration
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
###########################################
	movl $13, 108(%esp)	# outer loop counter: one pass per limb of x

	.align 32
Loop:
	## compute u and store in %ebp
	movl 136(%esp), %eax	# current x pointer (its slot is advanced below)
	movl 140(%esp), %esi	# y
	movl (%eax), %eax	# x[i]
	mull (%esi)		# %edx:%eax = x[i] * y[0]
	addl (%edi), %eax	# low limb + tmp[i]; the carry out is ignored
	mull 148(%esp)		# times inv_m
	movl %eax, %ebp		# u = low limb of that product
	movl 144(%esp), %esi	# src = m for the first addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 13
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
# %ebx and %ecx alternate as the pending low word / running high word.
# The movl instructions placed between addl/adcl do not modify flags, so
# the carry chain addl -> adcl %eax -> adcl %edx stays intact.
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 48(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 52(%edi)	# add the carry limb into dst[k]
	adcl $0, 56(%edi)	# and propagate the carry bit into dst[k+1]
	movl 136(%esp), %eax
	movl (%eax), %ebp	# mult = x[i]
	movl 140(%esp), %esi	# src = y for the second addmul1
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 13
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 16(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 12(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 20(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 16(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 24(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 20(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 28(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 24(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 32(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 28(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 36(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 32(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 40(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 36(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 44(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 40(%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 48(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 44(%edi)
	adcl %ebx, %eax
	adcl $0, %edx
	addl %eax, 48(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 52(%edi)
	adcl $0, 56(%edi)

	addl $4, 136(%esp)	# advance the x pointer stored in its argument slot
	addl $4, %edi		# slide the tmp window up by one limb
	decl 108(%esp)
	jnz Loop
###########################################
### Copy result in z
# Here %edi = &tmp[13]; limbs tmp[13..25] go to z[0..12].
	movl 132(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax	# carry
	addl $112, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
# NOTE(review): the next line starts with what appears to be a tar member
# header (path, mode, owner, size, mtime); it is kept verbatim.
ecm-6.4.4/athlon/mulredc12.asm0000644023561000001540000001213312106741265013004 00000000000000# mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
#  Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z							(4*(2k+7))%esp
#    ???   (1 limb???)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc12 TYPE(GSYM_PREFIX`'mulredc12,`function') GSYM_PREFIX`'mulredc12: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $104, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) ########################################### movl $12, 100(%esp) .align 32 Loop: ## compute u and store in %ebp movl 128(%esp), %eax movl 132(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 140(%esp) movl %eax, %ebp movl 136(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 12 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl 
$0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 44(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 48(%edi) adcl $0, 52(%edi) movl 128(%esp), %eax movl (%eax), %ebp movl 132(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 12 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 44(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 48(%edi) adcl $0, 52(%edi) addl $4, 128(%esp) addl $4, %edi decl 100(%esp) jnz Loop ########################################### ### Copy result in z movl 124(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 
12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax # carry addl $104, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc16.asm0000644023561000001540000001433312106741265013014 00000000000000# mp_limb_t mulredc16(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc16 TYPE(GSYM_PREFIX`'mulredc16,`function') GSYM_PREFIX`'mulredc16: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $136, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) movl $0, 124(%edi) movl $0, 128(%edi) ########################################### movl $16, 132(%esp) .align 32 Loop: ## compute u and store in %ebp movl 160(%esp), %eax movl 164(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 172(%esp) movl %eax, %ebp movl 168(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is 
(%edi) ### mult is %ebp ### k is 16 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) movl $0, %ebx adcl %eax, %ecx movl 56(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 52(%edi) movl $0, %ecx adcl %eax, %ebx movl 60(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 56(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 60(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 64(%edi) adcl $0, 68(%edi) movl 160(%esp), %eax movl (%eax), %ebp movl 164(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 16 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx 
movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) movl $0, %ebx adcl %eax, %ecx movl 56(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 52(%edi) movl $0, %ecx adcl %eax, %ebx movl 60(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 56(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 60(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 64(%edi) adcl $0, 68(%edi) addl $4, 160(%esp) addl $4, %edi decl 132(%esp) jnz Loop ########################################### ### Copy result in z movl 156(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl 
%eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax movl %eax, 60(%ebx) movl 64(%edi), %eax # carry addl $136, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc1.asm0000644023561000001540000000243612106741265012727 00000000000000
#
# mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y,
#                    const mp_limb_t m, mp_limb_t inv_m)
#
# Compute z := x*y mod m, in Montgomery representation, where x, y < m
# and m is n limbs wide. In this routine n = 1: x, y and m are single
# limbs passed by value, and only z is a pointer.
# inv_m is the least significant limb of the
# inverse of m modulo 2^(n*GMP_LIMB_BITS)
# NOTE(review): the code below relies on x*y + u*m having a zero low limb
# (see the "eax is 0" remark), which requires m*inv_m = -1 modulo
# 2^GMP_LIMB_BITS, i.e. inv_m is the negated inverse. Confirm with the
# code that computes inv_m.
#
# The result might be unreduced (larger than m) but becomes reduced
# after subtracting m. The calling function should take care of that.
#
# We use temporary space on the stack for the unreduced product.
# Therefore, this cannot be used for large integers (anyway, the
# algorithm is quadratic). In this one-limb version the temporary space
# is the argument slots of x and y, which are overwritten.
#
# WARNING: z is only n limbs but since it might be unreduced, there
# could be a carry that does not fit in z. This carry is returned.
#
# Parameters (all on the stack, offsets listed below):
#   z      pointer to the one-limb result
#   x, y   the two factors
#   m      the modulus
#   inv_m  per-modulus constant, see above
# Returns in %eax: the carry out of the high limb (0 or 1).
# Uses %eax, %ecx and %edx only; no registers are saved or restored.

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc1
	TYPE(GSYM_PREFIX`'mulredc1,`function')

GSYM_PREFIX`'mulredc1:
# Stack:
#    inv_m    20(%esp)
#    m        16
#    y        12(%esp)
#    x        8
#    z        4(%esp)
	movl 12(%esp), %eax	# eax = y
	mull 8(%esp)		# edx:eax = x*y
	movl %edx, 12(%esp)	# high limb of x*y replaces the y slot
	movl %eax, 8(%esp)	# store xy in [8(%esp):12(%esp)]
	mull 20(%esp)		# compute u (eax = low limb of xy_lo * inv_m; edx is discarded)
	mull 16(%esp)		# compute u*m
	addl 8(%esp), %eax	# eax is 0, now (carry is important)
	adcl 12(%esp), %edx	# edx = high(u*m) + high(x*y) + carry; sets CF on overflow
	movl 4(%esp), %ecx	# ecx = z; movl leaves CF untouched
	movl %edx, (%ecx)	# z[0] = possibly unreduced result
	adcl $0, %eax		# eax = 0 + CF: the carry returned to the caller
	ret
ecm-6.4.4/athlon/mulredc15.asm0000644023561000001540000001367312106741265013021 00000000000000# mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???)
# ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc15 TYPE(GSYM_PREFIX`'mulredc15,`function') GSYM_PREFIX`'mulredc15: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $128, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) ########################################### movl $15, 124(%esp) .align 32 Loop: ## compute u and store in %ebp movl 152(%esp), %eax movl 156(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 164(%esp) movl %eax, %ebp movl 160(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 15 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx 
mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) movl $0, %ebx adcl %eax, %ecx movl 56(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 52(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 56(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 60(%edi) adcl $0, 64(%edi) movl 152(%esp), %eax movl (%eax), %ebp movl 156(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 15 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx 
mull %ebp addl %ecx, 36(%edi) movl $0, %ecx adcl %eax, %ebx movl 44(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 40(%edi) movl $0, %ebx adcl %eax, %ecx movl 48(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 44(%edi) movl $0, %ecx adcl %eax, %ebx movl 52(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 48(%edi) movl $0, %ebx adcl %eax, %ecx movl 56(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 52(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 56(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 60(%edi) adcl $0, 64(%edi) addl $4, 152(%esp) addl $4, %edi decl 124(%esp) jnz Loop ########################################### ### Copy result in z movl 148(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax # carry addl $128, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc10.asm0000644023561000001540000001103312106741265013000 00000000000000# mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc10 TYPE(GSYM_PREFIX`'mulredc10,`function') GSYM_PREFIX`'mulredc10: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $88, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) ########################################### movl $10, 84(%esp) .align 32 Loop: ## compute u and store in %ebp movl 112(%esp), %eax movl 116(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 124(%esp) movl %eax, %ebp movl 120(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 10 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 36(%edi) adcl $0, %edx ### carry limb 
is in %edx addl %edx, 40(%edi) adcl $0, 44(%edi) movl 112(%esp), %eax movl (%eax), %ebp movl 116(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 10 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 36(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 40(%edi) adcl $0, 44(%edi) addl $4, 112(%esp) addl $4, %edi decl 84(%esp) jnz Loop ########################################### ### Copy result in z movl 108(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax # carry addl $88, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/Makefile.am0000644023561000001540000000224512106741265012543 
00000000000000MULREDC = mulredc1.asm mulredc2.asm mulredc3.asm mulredc4.asm mulredc5.asm \ mulredc6.asm mulredc7.asm mulredc8.asm mulredc9.asm mulredc10.asm \ mulredc11.asm mulredc12.asm mulredc13.asm mulredc14.asm \ mulredc15.asm mulredc16.asm mulredc17.asm mulredc18.asm \ mulredc19.asm mulredc20.asm EXTRA_DIST = Makefile.dev README autogen.py generate_all noinst_LTLIBRARIES = libmulredc.la # This library definition also causes the mulredc[n].asm and redc.asm files # to go in the distribution - no need for having them in EXTRA_DIST libmulredc_la_SOURCES = $(MULREDC) redc.asm noinst_HEADERS = mulredc.h # The asm code does not depend on any libraries except libc for abort() # if assertions are enabled LIBS = LDFLAGS = # It's actually the .s files that depend on config.m4, but automake # knows them only as intermediate files, not as targets. Adding the # dependency to libmulredc.la should work so long as no stale .s # files exist. libmulredc_la_DEPENDENCIES = $(top_builddir)/config.m4 .asm.s: $(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.s .asm.S: $(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.S ecm-6.4.4/athlon/mulredc4.asm0000644023561000001540000000553012106741265012730 00000000000000
# mp_limb_t mulredc4(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                    const mp_limb_t *m, mp_limb_t inv_m);
#
# Combined multiply-and-reduce for k = 4 limbs of 32 bits. Loop runs k
# times; pass i does, on a (2k+1)-limb scratch area tmp that starts at zero:
#     u    = low limb of (x[i]*y[0] + tmp[0]) * inv_m
#     tmp += u * m        (first addmul1 block)
#     tmp += x[i] * y     (second addmul1 block)
# and then moves the tmp window up by one limb (%edi += 4).
# After the loop the k limbs at the window are copied to z, and the limb
# above them is returned in %eax as the carry.
# The x argument slot on the stack is advanced in place, so it no longer
# points at x[0] when the routine returns.
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z							(4*(2k+7))%esp
#    return address (1 limb)				(4*(2k+6))%esp
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...      ## tmp space   (2*k+1 mp_limb_t)
#
# For k = 4, after the prologue: tmp is 0..32(%esp), counter 36, ebx 40,
# esi 44, edi 48, ebp 52, return address 56, z 60, x 64, y 68, m 72,
# inv_m 76.

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc4
	TYPE(GSYM_PREFIX`'mulredc4,`function')

GSYM_PREFIX`'mulredc4:
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $40, %esp		# 4*(2k+2) bytes: tmp (2k+1 limbs) + loop counter
	movl %esp, %edi		# edi = base of the tmp window
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
###########################################
	movl $4, 36(%esp)	# loop counter = k

	.align 32
Loop:
	## compute u and store in %ebp
	movl 64(%esp), %eax	# eax = current x pointer
	movl 68(%esp), %esi	# esi = y
	movl (%eax), %eax	# eax = x[i]
	mull (%esi)		# edx:eax = x[i]*y[0]
	addl (%edi), %eax	# low limb only: + tmp[0]
	mull 76(%esp)		# eax = low limb of that sum times inv_m
	movl %eax, %ebp		# ebp = u
	movl 72(%esp), %esi	# src = m
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 4
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
# ebx and ecx alternate as the accumulator for the limb being finished and
# the one being started. The zeroing movl does not change CF, so each adcl
# pair carries over from the preceding addl into memory. mull rewrites the
# flags, which is why it sits between an adcl and the next addl.
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	adcl %ecx, %eax
	adcl $0, %edx
	addl %eax, 12(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 16(%edi)
	adcl $0, 20(%edi)	# propagate one limb further
	movl 64(%esp), %eax
	movl (%eax), %ebp	# mult = x[i]
	movl 68(%esp), %esi	# src = y
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 4
###          kills %eax, %ebx, %ecx, %edx
###   dst[0,k[ += mult*src[0,k[  plus carry put in ecx or ebx
	movl (%esi), %eax
	mull %ebp
	movl %eax, %ebx
	movl %edx, %ecx
	movl 4(%esi), %eax
	mull %ebp
	addl %ebx, (%edi)
	movl $0, %ebx
	adcl %eax, %ecx
	movl 8(%esi), %eax
	adcl %edx, %ebx
	mull %ebp
	addl %ecx, 4(%edi)
	movl $0, %ecx
	adcl %eax, %ebx
	movl 12(%esi), %eax
	adcl %edx, %ecx
	mull %ebp
	addl %ebx, 8(%edi)
	adcl %ecx, %eax
	adcl $0, %edx
	addl %eax, 12(%edi)
	adcl $0, %edx
### carry limb is in %edx
	addl %edx, 16(%edi)
	adcl $0, 20(%edi)

	addl $4, 64(%esp)	# advance the x pointer (in its argument slot) to x[i+1]
	addl $4, %edi		# slide the tmp window up one limb
	decl 36(%esp)		# one pass done
	jnz Loop
###########################################
### Copy result in z
	movl 60(%esp), %ebx	# ebx = z
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax	# carry
	addl $40, %esp		# release tmp and counter; eax keeps the return value
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
ecm-6.4.4/athlon/mulredc8.asm0000644023561000001540000000772412106741265012743 00000000000000# mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc8 TYPE(GSYM_PREFIX`'mulredc8,`function') GSYM_PREFIX`'mulredc8: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $72, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) ########################################### movl $8, 68(%esp) .align 32 Loop: ## compute u and store in %ebp movl 96(%esp), %eax movl 100(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 108(%esp) movl %eax, %ebp movl 104(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 8 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi),
%eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 28(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 32(%edi) adcl $0, 36(%edi) movl 96(%esp), %eax movl (%eax), %ebp movl 100(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 8 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) adcl %ecx, %eax adcl $0, %edx addl %eax, 28(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 32(%edi) adcl $0, 36(%edi) addl $4, 96(%esp) addl $4, %edi decl 68(%esp) jnz Loop ########################################### ### Copy result in z movl 92(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax # carry addl $72, %esp 
popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/README0000644023561000001540000000121012106741265011356 00000000000000mulredc[1..20].asm are size-specific asm code for mulredc. These are generated by the Python script autogen.py. To avoid making the package depend on Python, this generation is not done automatically with the autoconf/automake stuff. If you need to regenerate them, the syntax is ./autogen.py 3 > mulredc3.asm And you can generate all of them with the shell script ./generate_all This asm code uses no MMX/SSE2 instructions and should work on any x86 computer. redc.asm is a version of redc separated from the multiplication, since there are cases where it is needed. test_mulredc.c, bench.c and the Makefile are for development. ecm-6.4.4/athlon/mulredc11.asm0000644023561000001540000001147112106741265013007 00000000000000# mp_limb_t mulredc11(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc11 TYPE(GSYM_PREFIX`'mulredc11,`function') GSYM_PREFIX`'mulredc11: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $96, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) ########################################### movl $11, 92(%esp) .align 32 Loop: ## compute u and store in %ebp movl 120(%esp), %eax movl 124(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 132(%esp) movl %eax, %ebp movl 128(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 11 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 
40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 40(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 44(%edi) adcl $0, 48(%edi) movl 120(%esp), %eax movl (%eax), %ebp movl 124(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 11 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) movl $0, %ecx adcl %eax, %ebx movl 28(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 24(%edi) movl $0, %ebx adcl %eax, %ecx movl 32(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 28(%edi) movl $0, %ecx adcl %eax, %ebx movl 36(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 32(%edi) movl $0, %ebx adcl %eax, %ecx movl 40(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 36(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 40(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 44(%edi) adcl $0, 48(%edi) addl $4, 120(%esp) addl $4, %edi decl 92(%esp) jnz Loop ########################################### ### Copy result in z movl 116(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl 
%eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax # carry addl $96, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/athlon/mulredc7.asm0000644023561000001540000000726312106741265012740 00000000000000# mp_limb_t mulredc7(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc7 TYPE(GSYM_PREFIX`'mulredc7,`function') GSYM_PREFIX`'mulredc7: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $64, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) ########################################### movl $7, 60(%esp) .align 32 Loop: ## compute u and store in %ebp movl 88(%esp), %eax movl 92(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 100(%esp) movl %eax, %ebp movl 96(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 7 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 
16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 24(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 28(%edi) adcl $0, 32(%edi) movl 88(%esp), %eax movl (%eax), %ebp movl 92(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 7 ### kills %eax, %ebx, %ecx, %edx ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx movl (%esi), %eax mull %ebp movl %eax, %ebx movl %edx, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, (%edi) movl $0, %ebx adcl %eax, %ecx movl 8(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 4(%edi) movl $0, %ecx adcl %eax, %ebx movl 12(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 8(%edi) movl $0, %ebx adcl %eax, %ecx movl 16(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 12(%edi) movl $0, %ecx adcl %eax, %ebx movl 20(%esi), %eax adcl %edx, %ecx mull %ebp addl %ebx, 16(%edi) movl $0, %ebx adcl %eax, %ecx movl 24(%esi), %eax adcl %edx, %ebx mull %ebp addl %ecx, 20(%edi) adcl %ebx, %eax adcl $0, %edx addl %eax, 24(%edi) adcl $0, %edx ### carry limb is in %edx addl %edx, 28(%edi) adcl $0, 32(%edi) addl $4, 88(%esp) addl $4, %edi decl 60(%esp) jnz Loop ########################################### ### Copy result in z movl 84(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax # carry addl $64, %esp popl %ebx popl %esi popl %edi popl %ebp ret ecm-6.4.4/mul_fft-params.h.pentium30000644023561000001540000001073112106741273014051 00000000000000#define MUL_FFT_MODF_THRESHOLD 480 #define SQR_FFT_MODF_THRESHOLD 480 #define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {305, 5 /*95*/}, {321, 4 /*97*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {801, 6 /*96*/}, {1281, 7 
/*91*/}, {1409, 6 /*97*/}, {1601, 7 /*92*/}, {1921, 6 /*98*/}, {1985, 7 /*94*/}, {2689, 8 /*91*/}, {2817, 7 /*95*/}, {3201, 8 /*92*/}, {3329, 7 /*96*/}, {3457, 8 /*87*/}, {3841, 7 /*96*/}, {3969, 8 /*88*/}, {4865, 7 /*97*/}, {4993, 8 /*90*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*83*/}, {11777, 8 /*97*/}, {12033, 9 /*85*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {15873, 8 /*98*/}, {16129, 9 /*88*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {26113, 10 /*81*/}, {31745, 9 /*98*/}, {34305, 10 /*85*/}, {39937, 9 /*98*/}, {40449, 10 /*83*/}, {48129, 11 /*75*/}, {63489, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {129025, 9 /*98*/}, {130561, 11 /*80*/}, {194561, 12 /*75*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 9 /*99*/}, {278017, 10 /*94*/}, {293889, 9 /*99*/}, {294401, 7 /*99*/}, {294529, 8 /*99*/}, {294657, 10 /*94*/}, {310273, 9 /*99*/}, {310785, 10 /*95*/}, {326657, 12 /*83*/}, {389121, 13 /*75*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {662529, 11 /*96*/}, {686081, 10 /*99*/}, {687105, 9 /*99*/}, {687617, 11 /*95*/}, {718849, 10 /*99*/}, {752641, 9 /*99*/}, {753153, 11 /*95*/}, {784385, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {980993, 10 /*99*/}, {982017, 12 /*93*/}, {LONG_MAX, 0}} #define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {273, 5 /*94*/}, {289, 4 /*97*/}, {305, 5 /*95*/}, {609, 6 /*95*/}, {641, 5 /*97*/}, {673, 6 /*95*/}, {705, 5 /*97*/}, {737, 6 /*96*/}, {1473, 7 /*96*/}, {1537, 6 /*98*/}, {1601, 7 /*96*/}, {1665, 6 /*98*/}, {1729, 7 /*96*/}, {2689, 8 /*91*/}, {2817, 7 /*97*/}, {2945, 8 /*92*/}, {3329, 7 /*98*/}, {3457, 8 /*93*/}, {5377, 9 /*91*/}, {5633, 8 /*95*/}, {6401, 9 /*92*/}, {6657, 8 /*96*/}, 
{6913, 9 /*87*/}, {7681, 8 /*96*/}, {7937, 9 /*88*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {24065, 10 /*85*/}, {27649, 11 /*87*/}, {30721, 10 /*96*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {39937, 11 /*83*/}, {47105, 10 /*97*/}, {48129, 12 /*75*/}, {61441, 11 /*96*/}, {63489, 10 /*98*/}, {68609, 11 /*85*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 12 /*85*/}, {323585, 10 /*99*/}, {326657, 9 /*99*/}, {327169, 10 /*95*/}, {330753, 12 /*84*/}, {389121, 10 /*99*/}, {392193, 9 /*99*/}, {392705, 10 /*96*/}, {408577, 9 /*99*/}, {409089, 8 /*99*/}, {409345, 10 /*96*/}, {412673, 12 /*90*/}, {454657, 13 /*87*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {555009, 10 /*99*/}, {556033, 9 /*99*/}, {556545, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {654337, 11 /*95*/}, {686081, 13 /*87*/}, {778241, 11 /*99*/}, {817153, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {915457, 12 /*93*/}, {978945, 14 /*93*/}, {LONG_MAX, 0}} #define MUL_FFT_FULL_TABLE2 {{100, 2}, {216, 1}, {256, 2}, {264, 1}, {304, 2}, {312, 1}, {544, 4}, {560, 1}, {704, 2}, {720, 1}, {896, 2}, {960, 7}, {40960, 2}, {47616, 1}, {49152, 6}, {53760, 4}, {56320, 1}, {64512, 4}, {71680, 5}, {86016, 2}, {96768, 4}, {99840, 1}, {131072, 6}, {136192, 7}, {147456, 6}, {150528, 4}, {161280, 1}, {161792, 3}, {172032, 2}, {193536, 1}, {259072, 6}, {286720, 7}, {294912, 6}, {301056, 4}, {322560, 3}, {344064, 2}, {387072, 1}, {393216, 4}, {404480, 3}, {409600, 1}, {417792, 3}, {425984, 1}, {524288, 6}, {530432, 7}, 
{557056, 6}, {566272, 5}, {577536, 4}, {593920, 6}, {602112, 5}, {614400, 4}, {645120, 3}, {647168, 4}, {652800, 1}, {654336, 6}, {673792, 3}, {688128, 2}, {724992, 4}, {727040, 1}, {753664, 2}, {783360, 4}, {816640, 6}, {831488, 1}, {851968, 2}, {860160, 3}, {868352, 2}, {881664, 7}, {884736, 1}, {921600, 7}, {950272, 1}, {LONG_MAX, 1}} ecm-6.4.4/ecm-params.h.corei50000644023561000001540000000314412106741273012603 00000000000000/* tuned on confit.loria.fr (Intel(R) Core(TM) i5-2500 CPU) */ #ifndef HAVE_MPIR /* tuning parameters for GMP, tuned for GMP 5.0.4 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 13, 13, 13, 14, 14, 15, 16, 16, 17, 20, 22} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 8 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 128 #define MPZSPV_NORMALISE_STRIDE 512 #else /* tuning parameters for MPIR, tuned for MPIR 2.5.1 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,2,2} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,2,2,1,2,2,2,2,2,2,2} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 12, 13, 14, 15, 14, 16, 18, 18, 20, 18, 20} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 128 #define 
POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 256 #endif ecm-6.4.4/pp1.c0000644023561000001540000007361412106741273010075 00000000000000/* The 'P+1' algorithm. Copyright 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Paul Zimmermann and Alexander Kruppa. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ /* References: A p+1 Method of Factoring, H. C. Williams, Mathematics of Computation, volume 39, number 159, pages 225-234, 1982. Evaluating recurrences of form X_{m+n} = f(X_m, X_n, X_{m-n}) via Lucas chains, Peter L. Montgomery, December 1983, revised January 1992. */ #include #include #include "ecm-impl.h" #ifdef HAVE_LIMITS_H # include #else # ifndef ULONG_MAX # define ULONG_MAX __GMP_ULONG_MAX # endif #endif /****************************************************************************** * * * Stage 1 * * * ******************************************************************************/ /* prime powers are accumulated up to about n^L1 */ #define L1 1 /* P1 <- V_e(P0), using P, Q as auxiliary variables, where V_{2k}(P0) = V_k(P0)^2 - 2 V_{2k-1}(P0) = V_k(P0)*V_{k-1}(P0) - P0. (More generally V_{m+n} = V_m * V_n - V_{m-n}.) Warning: P1 and P0 may be equal. 
*/
/* Implementation notes for pp1_mul:
   - e is altered while the Lucas chain is evaluated (absolute value taken,
     then decremented), but it holds its original value, sign included,
     again when the function returns.
   - P0 is only read inside the chain and P1 is written last, which is why
     P1 and P0 are allowed to be the same variable. */
static void
pp1_mul (mpres_t P1, mpres_t P0, mpz_t e, mpmod_t n, mpres_t P, mpres_t Q)
{
  const int sign = mpz_sgn (e); /* remembered so that e can be restored */
  unsigned long bit;

  mpz_abs (e, e);

  if (sign == 0)
    {
      /* V_0(P0) = 2 */
      mpres_set_ui (P1, 2, n);
    }
  else if (mpz_cmp_ui (e, 1) == 0)
    {
      /* V_1(P0) = P0 */
      mpres_set (P1, P0, n);
    }
  else
    {
      /* here e >= 2; the chain below is driven by the bits of e - 1 */
      mpz_sub_ui (e, e, 1);

      mpres_sqr (P, P0, n);
      mpres_sub_ui (P, P, 2, n);  /* P = V_2(P0) = P0^2-2 */
      mpres_set (Q, P0, n);       /* Q = V_1(P0) = P0 */

      /* invariant: (P, Q) = (V_{k+1}(P0), V_k(P0)), start with k=1.
         The top bit of e is consumed by the initialisation above, the
         remaining bits are scanned from most to least significant. */
      bit = mpz_sizeinbase (e, 2) - 1;
      while (bit > 0)
        {
          bit--;
          if (mpz_tstbit (e, bit))
            {
              /* k -> 2k+1 */
              if (bit != 0) /* Q is not needed for last iteration */
                {
                  mpres_mul (Q, P, Q, n);
                  mpres_sub (Q, Q, P0, n);
                }
              mpres_sqr (P, P, n);
              mpres_sub_ui (P, P, 2, n);
            }
          else
            {
              /* k -> 2k */
              mpres_mul (P, P, Q, n);
              mpres_sub (P, P, P0, n);
              if (bit != 0) /* Q is not needed for last iteration */
                {
                  mpres_sqr (Q, Q, n);
                  mpres_sub_ui (Q, Q, 2, n);
                }
            }
        }

      mpres_set (P1, P, n);
      mpz_add_ui (e, e, 1); /* recover original value of e */
    }

  /* give e its sign back if it was negative on entry */
  if (sign == -1)
    mpz_neg (e, e);
}

/* Input:  P0 is the initial point (sigma)
           n is the number to factor
           B1 is the stage 1 bound
           B1done: stage 1 was already done up to that limit
           go: if <> 1, group order to preload
   Output: a is the factor found, or the value at end of stage 1
           B1done is set to B1 if stage 1 completed normally,
           or to the largest prime processed if interrupted, but never
           to a smaller value than B1done was upon function entry.
   Return value: non-zero iff a factor was found.
*/ static int pp1_stage1 (mpz_t f, mpres_t P0, mpmod_t n, double B1, double *B1done, mpz_t go, int (*stop_asap)(void), char *chkfilename) { double B0, p, q, r, last_chkpnt_p; mpz_t g; mpres_t P, Q; mpres_t R, S, T; int youpi = ECM_NO_FACTOR_FOUND; unsigned int max_size, size_n; long last_chkpnt_time; mpz_init (g); mpres_init (P, n); mpres_init (Q, n); mpres_init (R, n); mpres_init (S, n); mpres_init (T, n); B0 = ceil (sqrt (B1)); size_n = mpz_sizeinbase (n->orig_modulus, 2); max_size = L1 * size_n; if (mpz_cmp_ui (go, 1) > 0) pp1_mul (P0, P0, go, n, P, Q); /* suggestion from Peter Montgomery: start with exponent n^2-1, as factors of Lucas and Fibonacci number are either +/-1 (mod index), and so is n. Therefore, index will appear as a factor of n^2-1 and be included in stage 1. Do this only when n is composite, otherwise all tests with prime n factor of a Cunningham number will succeed in stage 1. As in P-1, for small overhead, use that trick only when lg(n) <= sqrt(B1). */ if ((double) size_n <= B0 && mpz_probab_prime_p (n->orig_modulus, PROBAB_PRIME_TESTS) == 0) { mpz_mul (g, n->orig_modulus, n->orig_modulus); mpz_sub_ui (g, g, 1); pp1_mul (P0, P0, g, n, P, Q); } mpz_set_ui (g, 1); last_chkpnt_p = 2.; last_chkpnt_time = cputime (); /* first loop through small primes <= sqrt(B1) */ for (p = 2.0; p <= B0; p = getprime ()) { for (q = 1, r = p; r <= B1; r *= p) if (r > *B1done) q *= p; mpz_mul_d (g, g, q, Q); if (mpz_sizeinbase (g, 2) >= max_size) { pp1_mul (P0, P0, g, n, P, Q); mpz_set_ui (g, 1); if (stop_asap != NULL && (*stop_asap) ()) { outputf (OUTPUT_NORMAL, "Interrupted at prime %.0f\n", p); if (p > *B1done) *B1done = p; goto clear_and_exit; } } } pp1_mul (P0, P0, g, n, P, Q); #if 1 /* All primes sqrt(B1) < p <= B1 appear in exponent 1. 
All primes <= B1done are already included in exponent of at least 1, so it's save to skip ahead to B1done+1 */ if (*B1done > p) { getprime_seek ((*B1done) + 1.); p = getprime (); } #endif /* then all primes > sqrt(B1) and taken with exponent 1 */ for (; p <= B1; p = getprime ()) { pp1_mul_prac (P0, (ecm_uint) p, n, P, Q, R, S, T); if (stop_asap != NULL && (*stop_asap) ()) { outputf (OUTPUT_NORMAL, "Interrupted at prime %.0f\n", p); if (p > *B1done) *B1done = p; goto clear_and_exit; } if (chkfilename != NULL && p > last_chkpnt_p + 10000. && elltime (last_chkpnt_time, cputime ()) > CHKPNT_PERIOD) { writechkfile (chkfilename, ECM_PP1, p, n, NULL, P0, NULL); last_chkpnt_p = p; last_chkpnt_time = cputime (); } } /* If stage 1 finished normally, p is the smallest prime >B1 here. In that case, set to B1 */ if (p > B1) p = B1; if (p > *B1done) *B1done = p; mpres_sub_ui (P, P0, 2, n); mpres_gcd (f, P, n); youpi = mpz_cmp_ui (f, 1); clear_and_exit: if (chkfilename != NULL) writechkfile (chkfilename, ECM_PP1, p, n, NULL, P0, NULL); getprime_clear (); /* free the prime tables, and reinitialize */ mpres_clear (Q, n); mpres_clear (R, n); mpres_clear (S, n); mpres_clear (T, n); mpz_clear (g); mpres_clear (P, n); return youpi; } /* checks if the factor p was found by P+1 or P-1 (when prime). a is the initial seed. */ static void pp1_check_factor (mpz_t a, mpz_t p) { if (mpz_probab_prime_p (p, PROBAB_PRIME_TESTS)) { mpz_mul (a, a, a); mpz_sub_ui (a, a, 4); if (mpz_jacobi (a, p) == 1) outputf (OUTPUT_NORMAL, "[factor found by P-1]\n"); } } /****************************************************************************** * * * Stage 2 * * * ******************************************************************************/ /* let alpha, beta be the roots of x^2-Px+1=0 set a, b such that alpha^e = a*alpha+b (idem for beta), i.e. a*x+b = rem(x^e, x^2-Px+1). Since (x-alpha)*(x-beta) = x^2-Px+1, we have alpha*beta = 1 and alpha+beta = P, i.e. 1/alpha = beta = -alpha + P. 
It seems that if x^e % (x^2-Px+1) = a*x+b, then
   x^{-e+1} % (x^2-Px+1) = b*x+a. Proof?
   (Sketch: modulo x^2-Px+1 we have 1/x = P-x, and x -> P-x maps the ring
   onto itself while swapping alpha and beta, hence
   x^{-e} = a*(P-x)+b = -a*x + (a*P+b).  Multiplying by x and replacing
   x^2 by P*x-1 gives x^{1-e} = -a*(P*x-1) + (a*P+b)*x = b*x + a.)

   pp1_mul2 sets a and b such that x^e = a*x+b modulo x^2-P*x+1, with all
   coefficients reduced modulo n.  e may be zero or negative; for negative e
   the powering is done with 1/x = -x+P as base.  e itself is not modified
   (a private copy of |e| is used). */
static void
pp1_mul2 (mpres_t a, mpres_t b, mpres_t P, mpz_t e, mpmod_t n)
{
  unsigned long l;
  mpres_t t;
  mpz_t abs_e;
  const int positive_e = (mpz_sgn (e) > 0);

  if (mpz_cmp_ui (e, 0UL) == 0) /* x^0 = 1 */
    {
      mpres_set_ui (a, 0, n);
      mpres_set_ui (b, 1, n);
      return;
    }

  mpres_init (t, n);
  mpz_init (abs_e);
  mpz_abs (abs_e, e);

  /* start from the base itself: this accounts for the top bit of |e| */
  if (positive_e)
    {
      mpres_set_ui (a, 1, n);
      mpres_set_ui (b, 0, n);
    }
  else
    {
      /* Set to -x+P */
      mpres_set_ui (a, 1, n);
      mpres_neg (a, a, n);
      mpres_set (b, P, n);
    }

  l = mpz_sizeinbase (abs_e, 2) - 1; /* number of bits of e (minus 1) */

  /* left-to-right binary powering over the remaining bits l-1, ..., 0 */
  while (l--)
    {
      /* square: (ax+b)^2 = (a^2P+2ab) x + (b^2-a^2) */
      /* the order of the next statements matters: t keeps the old a^2
         while a and b are being overwritten */
      mpres_sqr (t, a, n);    /* a^2 */
      mpres_mul (a, a, b, n);
      mpres_add (a, a, a, n); /* 2ab */
      mpres_sqr (b, b, n);    /* b^2 */
      mpres_sub (b, b, t, n); /* b^2-a^2 */
      mpres_mul (t, t, P, n); /* a^2P */
      mpres_add (a, t, a, n); /* a^2P+2ab */

      if (mpz_tstbit (abs_e, l))
        {
          if (positive_e)
            {
              /* multiply: (ax+b)*x = (aP+b) x - a */
              mpres_mul (t, a, P, n);
              mpres_add (t, t, b, n);
              mpres_neg (b, a, n);
              mpres_set (a, t, n);
            }
          else
            {
              /* multiply: (ax+b)*(-x+P) = -ax^2+(aP-b)x+b*P ==
                 -bx + (bP + a) (mod x^2-P*x+1) */
              mpres_mul (t, b, P, n);
              mpres_add (t, t, a, n);
              mpres_neg (a, b, n);
              mpres_set (b, t, n);
            }
        }
    }

  mpz_clear (abs_e);
  mpres_clear (t, n);
}

/* Performs the following: for (i=0;id1, root_params->d2, root_params->S, dF); mpres_init (u, modulus); mpres_init (v, modulus); if (ABS(root_params->S) == 1) /* special code with d1/6 muls */ { mpres_init (fd[0], modulus); mpres_init (fd[1], modulus); mpres_init (fd[2], modulus); mpz_set_ui (*t, root_params->d2); pp1_mul (fd[2], *x, *t, modulus, u, v); mpres_get_z (F[0], fd[2], modulus); mpz_set_ui (*t, 7UL); pp1_mul (fd[0], fd[2], *t, modulus, u, v); mpz_set_ui (*t, 6UL); pp1_mul (fd[1], fd[2], *t, modulus, u, v); /* fd[0] = V_{7*d2}(P), fd[1] = V_{6*d2}(P), fd[2] = V_{d2}(P) */ outputf (OUTPUT_VERBOSE, "Initializing table of differences for F
took %ldms\n", elltime (st1, cputime ())); i = 1; j = 7; while (i < dF) { if (gcd (j, root_params->d1) == 1) /* (d2,d1) == 1 ==> (j*d2,d1) == (j,d1) */ mpres_get_z (F[i++], fd[0], modulus); /* V_{m+n} = V_m * V_n - V_{m-n} */ /* fd[0] = V_m, fd[1] = V_n, fd[2] = V_{m-n} */ mpres_swap (fd[0], fd[2], modulus); /* fd[0] = V_{m-n}, fd[1] = V_n, fd[2] = V_m */ mpres_mul (u, fd[2], fd[1], modulus); /* u = V_n * V_m */ mpres_sub (fd[0], u, fd[0], modulus); /* fd[0] = V_n * V_m - V_{m-n} = V_{m+n}, hence */ /* fd[0] = V_{m+n}, fd[1] = V_n, fd[2] = V_m */ j += 6; muls ++; } mpres_clear (fd[0], modulus); mpres_clear (fd[1], modulus); mpres_clear (fd[2], modulus); } else /* case |S| <> 1: this code works also for S=1, but is more expensive, since it can use up to 4*(d1/6) muls */ { init_roots_params (params, root_params->S, root_params->d1, root_params->d2, 1.0); mpz_set_ui (*t, 0UL); coeffs = init_progression_coeffs (*t, params->dsieve, root_params->d2, 1, 6, params->S, params->dickson_a); if (coeffs == NULL) return ECM_ERROR; state.fd = (point *) malloc (params->size_fd * sizeof (point)); if (state.fd == NULL) { clear_list (coeffs, params->size_fd); return ECM_ERROR; } for (i = 0; i < params->size_fd; i++) { mpres_init (state.fd[i].x, modulus); mpres_init (state.fd[i].y, modulus); /* if i = k*(S+1) + S for k>=1, we can copy x and y from i - (S+1) */ if (i > params->S && (i % (params->S + 1) == params->S)) { mpres_set (state.fd[i].x, state.fd[params->S].x, modulus); mpres_set (state.fd[i].y, state.fd[params->S].y, modulus); } else pp1_mul2 (state.fd[i].x, state.fd[i].y, x[0], coeffs[i], modulus); } clear_list (coeffs, params->size_fd); outputf (OUTPUT_VERBOSE, "Initializing table of differences for F took %ldms\n", elltime (st1, cputime ())); /* Now for the actual calculation of the roots. */ for (i = 0; i < dF && !youpi;) { /* Is this a rsieve value where we computed Dickson(j * d2) * X? 
*/ if (gcd (params->rsieve, params->dsieve) == 1) { /* Did we use every progression since the last update? */ if (params->next == params->nr) { /* Yes, time to update again */ addWnm (state.fd, x[0], modulus, params->nr, params->S, &muls); params->next = 0; } /* Is this a j value where we want Dickson(j*d2)*X as a root? */ if (gcd (params->rsieve, root_params->d1) == 1) { /* we have alpha^k = x * alpha + y thus alpha^k + beta^k = x * P + 2 * y. FIXME: can we avoid returning to the Lucas form? */ mpres_mul (u, state.fd[params->next * (params->S + 1)].x, x[0], modulus); mpres_add (v, state.fd[params->next * (params->S + 1)].y, state.fd[params->next * (params->S + 1)].y, modulus); mpres_add (u, u, v, modulus); mpres_get_z (F[i++], u, modulus); } params->next ++; } params->rsieve += 6; } for (i = 0; i < params->size_fd; i++) { mpres_clear (state.fd[i].x, modulus); mpres_clear (state.fd[i].y, modulus); } free (state.fd); } mpres_clear (u, modulus); mpres_clear (v, modulus); outputf (OUTPUT_VERBOSE, "Computing roots of F took %ldms", elltime (st, cputime ())); outputf (OUTPUT_DEVVERBOSE, " and %d muls", muls); outputf (OUTPUT_VERBOSE, "\n"); return youpi; } /* return NULL if an error occurred */ pp1_roots_state_t * pp1_rootsG_init (mpres_t *x, root_params_t *root_params, mpmod_t modulus) { mpres_t P; pp1_roots_state_t *state; progression_params_t *params; /* for less typing */ unsigned long i; ASSERT (gcd (root_params->d1, root_params->d2) == 1); state = (pp1_roots_state_t *) malloc (sizeof (pp1_roots_state_t)); if (state == NULL) return NULL; params = &(state->params); /* we don't need the sign anymore after pp1_rootsG_init */ params->S = ABS(root_params->S); if (params->S == 1) { mpz_t t; mpz_init (t); mpres_init (P, modulus); for (i = 0; i < 4; i++) mpres_init (state->tmp[i], modulus); params->dsieve = root_params->d2; /* needed in pp1_rootsG */ /* We want to skip values where gcd((i0 + i) * d1, d2) != 1. 
We can test for gcd(i0 + i, d2) instead and let pp1_rootsG() advance params->rsieve in steps of 1 */ /* params->rsieve = i0 % d2 */ params->rsieve = mpz_fdiv_ui (root_params->i0, root_params->d2); outputf (OUTPUT_DEVVERBOSE, "pp1_rootsG_init: i0 = %Zd, state: " "dsieve = %d, rsieve = %d, S = %d\n", root_params->i0, params->dsieve, params->rsieve, params->S); mpz_set_ui (t, root_params->d1); pp1_mul (state->tmp[1], *x, t, modulus, state->tmp[3], P); pp1_mul (state->tmp[0], state->tmp[1], root_params->i0, modulus, state->tmp[3], P); mpz_sub_ui (t, root_params->i0, 1); mpz_abs (t, t); pp1_mul (state->tmp[2], state->tmp[1], t, modulus, state->tmp[3], P); /* for P+1, tmp[0] = V_s(P), tmp[1] = V_d1(P), tmp[2] = V_{|s-d1|}(P) */ mpres_clear (P, modulus); mpz_clear (t); } else { listz_t coeffs; params->dickson_a = (root_params->S < 0) ? -1 : 0; params->nr = (root_params->d2 > 1) ? root_params->d2 - 1 : 1; params->size_fd = params->nr * (params->S + 1); params->next = 0; params->dsieve = 1; params->rsieve = 1; state->fd = (point *) malloc (params->size_fd * sizeof (point)); if (state->fd == NULL) { free (state); return NULL; } coeffs = init_progression_coeffs (root_params->i0, root_params->d2, root_params->d1, 1, 1, params->S, params->dickson_a); if (coeffs == NULL) { free (state->fd); free (state); return NULL; } for (i = 0; i < params->size_fd; i++) { mpres_init (state->fd[i].x, modulus); mpres_init (state->fd[i].y, modulus); /* The S-th coeff of all progressions is identical */ if (i > params->S && i % (params->S + 1) == params->S) { /* Simply copy from the first progression */ mpres_set (state->fd[i].x, state->fd[params->S].x, modulus); mpres_set (state->fd[i].y, state->fd[params->S].y, modulus); } else pp1_mul2 (state->fd[i].x, state->fd[i].y, x[0], coeffs[i], modulus); } clear_list (coeffs, params->size_fd); } return state; } void pp1_rootsG_clear (pp1_roots_state_t *state, ATTRIBUTE_UNUSED mpmod_t modulus) { unsigned long i; if (state->params.S == 1) { for (i = 0; i 
< 4; i++) mpres_clear (state->tmp[i], modulus); } else { for (i = 0; i < state->params.size_fd; i++) { mpres_clear (state->fd[i].x, modulus); mpres_clear (state->fd[i].y, modulus); } free (state->fd); } free (state); } int pp1_rootsG (listz_t G, unsigned long dF, pp1_roots_state_t *state, mpmod_t modulus, mpres_t *x) { unsigned long i; unsigned long muls = 0; long st; progression_params_t *params = &(state->params); /* for less typing */ st = cputime (); /* params->S is positive: we don't need the sign anymore, since the polynomial is defined by the table of differences */ if (params->S == 1) { for (i = 0; i < dF;) { if (gcd (params->rsieve, params->dsieve) == 1) { outputf (OUTPUT_TRACE, "pp1_rootsG: Taking root G[%d], rsieve = %d\n", i, params->rsieve); mpres_get_z (G[i++], state->tmp[0], modulus); } else { outputf (OUTPUT_TRACE, "pp1_rootsG: NOT taking root, rsieve = %d, gcd = %d\n", params->rsieve, gcd (params->rsieve, params->dsieve)); } mpres_swap (state->tmp[0], state->tmp[2], modulus); mpres_mul (state->tmp[3], state->tmp[2], state->tmp[1], modulus); mpres_sub (state->tmp[0], state->tmp[3], state->tmp[0], modulus); params->rsieve++; } } else { mpres_t u, v; mpres_init (u, modulus); mpres_init (v, modulus); for (i = 0; i < dF;) { /* Did we use every progression since the last update? */ if (params->next == params->nr) { /* Yes, time to update again */ addWnm (state->fd, x[0], modulus, params->nr, params->S, &muls); params->next = 0; } /* Is this a root we should skip? 
(Take only if gcd == 1) */ if (gcd (params->rsieve, params->dsieve) == 1) { mpres_mul (u, state->fd[params->next * (params->S + 1)].x, x[0], modulus); mpres_add (v, state->fd[params->next * (params->S + 1)].y, state->fd[params->next * (params->S + 1)].y, modulus); mpres_add (u, u, v, modulus); mpres_get_z (G[i++], u, modulus); } params->next ++; params->rsieve ++; } mpres_clear (u, modulus); mpres_clear (v, modulus); } outputf (OUTPUT_VERBOSE, "Computing roots of G took %ldms", elltime (st, cputime ())); outputf (OUTPUT_DEVVERBOSE, ", %lu muls", dF); outputf (OUTPUT_VERBOSE, "\n"); return ECM_NO_FACTOR_FOUND; }

/******************************************************************************
*                                                                             *
*                               Williams P+1                                  *
*                                                                             *
******************************************************************************/

/* Input:  p is the initial generator (sigma), if 0 generate it at random.
           n is the number to factor
           B1 is the stage 1 bound
           B2 is the stage 2 bound
           k is the number of blocks for stage 2
           verbose is the verbosity level
   Output: f receives the factor found (2 if n is even);
           p is overwritten with the residue obtained after stage 1.
   Return value: non-zero iff a factor is found (1 for stage 1, 2 for stage 2),
           ECM_ERROR if the stage 2 parameters cannot be chosen or if B1 is
           too large, ECM_NO_FACTOR_FOUND otherwise.

   B2min_parm and B2_parm are copied into local variables, so the caller's
   values are never modified.  Every exit taken after those copies exist
   goes through clear_and_exit, which releases everything allocated here.
*/
int
pp1 (mpz_t f, mpz_t p, mpz_t n, mpz_t go, double *B1done, double B1,
     mpz_t B2min_parm, mpz_t B2_parm, double B2scale, unsigned long k,
     const int S, int verbose, int repr, int use_ntt, FILE *os, FILE *es,
     char *chkfilename, char *TreeFilename, double maxmem,
     gmp_randstate_t rng, int (*stop_asap)(void))
{
  int youpi = ECM_NO_FACTOR_FOUND;
  int po2 = 0;    /* Whether we should use power-of-2 poly degree */
  long st;
  mpres_t a;
  mpmod_t modulus;
  mpz_t B2min, B2; /* Local B2, B2min to avoid changing caller's values */
  unsigned long dF;
  root_params_t root_params;
  faststage2_param_t faststage2_params;
  /* Non-zero selects the fast stage 2 (pp1fs2 / pp1fs2_ntt), zero selects
     the generic stage2 () with a Brent-Suyama extension of degree S */
  const int stage2_variant = (S == 1 || S == ECM_DEFAULT_S);
  int twopass = 0;

  set_verbose (verbose);
  ECM_STDOUT = (os == NULL) ? stdout : os;
  /* NOTE(review): a NULL es falls back to stdout, not stderr, although
     README.lib documents stderr as the default error stream -- confirm
     whether this is intended before changing it. */
  ECM_STDERR = (es == NULL) ? stdout : es;

  /* if n is even, return 2 (nothing has been allocated yet, so a direct
     return is fine here) */
  if (mpz_divisible_2exp_p (n, 1))
    {
      mpz_set_ui (f, 2);
      return ECM_FACTOR_FOUND_STEP1;
    }

  st = cputime ();

  if (mpz_cmp_ui (p, 0) == 0)
    pm1_random_seed (p, n, rng);

  mpz_init_set (B2min, B2min_parm);
  mpz_init_set (B2, B2_parm);

  /* Set default B2. See ecm.c for comments */
  if (ECM_IS_DEFAULT_B2(B2))
    {
      if (stage2_variant == 0)
        mpz_set_d (B2, B2scale * pow (B1 * PP1_COST, DEFAULT_B2_EXPONENT));
      else
        mpz_set_d (B2, B2scale * pow (B1 * PP1FS2_COST,
                   PM1FS2_DEFAULT_B2_EXPONENT));
    }

  /* set B2min */
  if (mpz_sgn (B2min) < 0)
    mpz_set_d (B2min, B1);

  mpmod_init (modulus, n, repr);

  /* Initialise the residue right away: clear_and_exit always clears it, and
     the parameter-selection code below can jump there before stage 1 */
  mpres_init (a, modulus);

  if (use_ntt)
    po2 = 1;

  if (stage2_variant != 0)
    {
      long P;
      const unsigned long lmax = 1UL<<28; /* An upper bound */
      unsigned long lmax_NTT, lmax_noNTT;

      mpz_init (faststage2_params.m_1);
      faststage2_params.l = 0;

      /* Find out what the longest transform length is we can do at all.
         If no maxmem is given, the non-NTT can theoretically do any length. */

      lmax_NTT = 0;
      if (use_ntt)
        {
          unsigned long t, t2 = 0;
          /* See what transform length that the NTT can handle (due to limited
             primes and limited memory) */
          t = mpzspm_max_len (n);
          lmax_NTT = MIN (lmax, t);
          if (maxmem != 0.)
            {
              t = pp1fs2_maxlen (double_to_size (maxmem), n, use_ntt, 0);
              t = MIN (t, lmax_NTT);
              /* Maybe the two pass variant lets us use a longer transform */
              t2 = pp1fs2_maxlen (double_to_size (maxmem), n, use_ntt, 1);
              t2 = MIN (t2, lmax_NTT);
              if (t2 > t)
                {
                  t = t2;
                  twopass = 1;
                }
              lmax_NTT = t;
            }
          outputf (OUTPUT_DEVVERBOSE, "NTT can handle lmax <= %lu\n", lmax_NTT);
        }

      /* See what transform length that the non-NTT code can handle */
      lmax_noNTT = lmax;
      if (maxmem != 0.)
        {
          unsigned long t;
          t = pp1fs2_maxlen (double_to_size (maxmem), n, 0, 0);
          lmax_noNTT = MIN (lmax_noNTT, t);
          outputf (OUTPUT_DEVVERBOSE, "non-NTT can handle lmax <= %lu\n",
                   lmax_noNTT);
        }

      P = choose_P (B2min, B2, MAX(lmax_noNTT, lmax_NTT), k,
                    &faststage2_params, B2min, B2, use_ntt, ECM_PP1);
      if (P == ECM_ERROR)
        {
          outputf (OUTPUT_ERROR,
                   "Error: cannot choose suitable P value for your stage 2 "
                   "parameters.\nTry a shorter B2min,B2 interval.\n");
          /* Go through the common cleanup so that B2, B2min, modulus and a
             are released as well as m_1 */
          youpi = ECM_ERROR;
          goto clear_and_exit;
        }

      /* See if the selected parameters let us use NTT or not */
      if (faststage2_params.l > lmax_NTT)
        use_ntt = 0;

      if (maxmem != 0.)
        {
          unsigned long MB;
          const char *s;
          if (!use_ntt)
            s = "out";
          else if (twopass)
            s = " two pass";
          else
            s = " one pass";

          MB = pp1fs2_memory_use (faststage2_params.l, n, use_ntt, twopass)
               / 1048576;
          outputf (OUTPUT_VERBOSE, "Using lmax = %lu with%s NTT which takes "
                   "about %luMB of memory\n", faststage2_params.l, s, MB);
        }
    }
  else
    {
      mpz_init (root_params.i0);
      root_params.d2 = 0; /* Enable automatic choice of d2 */
      if (bestD (&root_params, &k, &dF, B2min, B2, po2, use_ntt, maxmem,
                 (TreeFilename != NULL), modulus) == ECM_ERROR)
        {
          youpi = ECM_ERROR;
          goto clear_and_exit;
        }

      /* Set default degree for Brent-Suyama extension */
      root_params.S = S;
      if (root_params.S == ECM_DEFAULT_S)
        {
          if (modulus->repr == ECM_MOD_BASE2 && modulus->Fermat > 0)
            {
              /* For Fermat numbers, default is 1 (no Brent-Suyama) */
              root_params.S = 1;
            }
          else
            {
              mpz_t t;
              mpz_init (t);
              mpz_sub (t, B2, B2min);
              root_params.S = choose_S (t);
              mpz_clear (t);
            }
        }
    }

  /* Print B1, B2, polynomial and x0 */
  print_B1_B2_poly (OUTPUT_NORMAL, ECM_PP1, B1, *B1done, B2min_parm, B2min,
                    B2, (stage2_variant == 0) ? root_params.S : 1, p, 0, NULL);

  /* If we do a stage 2, print its parameters */
  if (mpz_cmp (B2, B2min) >= 0)
    {
      if (stage2_variant != 0)
        outputf (OUTPUT_VERBOSE, "P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu, "
                 "m_1 = %Zd\n", faststage2_params.P, faststage2_params.l,
                 faststage2_params.s_1,faststage2_params.s_2,
                 faststage2_params.m_1);
      else
        /* stage2_variant == 0 implies S != 1, so i0 is always the value
           that applies here */
        outputf (OUTPUT_VERBOSE, "dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n",
                 dF, k, root_params.d1, root_params.d2, root_params.i0);
    }

  mpres_set_z (a, p, modulus);

  /* since pp1_mul_prac takes an ecm_uint, we have to check
     that B1 <= ECM_UINT_MAX */
  if (B1 > (double) ECM_UINT_MAX)
    {
      outputf (OUTPUT_ERROR, "Error, maximal step1 bound for P+1 is %lu\n",
               ECM_UINT_MAX);
      youpi = ECM_ERROR;
      goto clear_and_exit;
    }

  if (B1 > *B1done)
    youpi = pp1_stage1 (f, a, modulus, B1, B1done, go, stop_asap,
                        chkfilename);

  outputf (OUTPUT_NORMAL, "Step 1 took %ldms\n", elltime (st, cputime ()));
  if (test_verbose (OUTPUT_RESVERBOSE))
    {
      mpz_t t;

      mpz_init (t);
      mpres_get_z (t, a, modulus);
      outputf (OUTPUT_RESVERBOSE, "x=%Zd\n", t);
      mpz_clear (t);
    }

  /* hand the stage 1 residue back to the caller through p */
  mpres_get_z (p, a, modulus);

  if (stop_asap != NULL && (*stop_asap) ())
    goto clear_and_exit;

  if (youpi == ECM_NO_FACTOR_FOUND && mpz_cmp (B2, B2min) >= 0)
    {
      if (stage2_variant != 0)
        {
          if (use_ntt)
            youpi = pp1fs2_ntt (f, a, modulus, &faststage2_params, twopass);
          else
            youpi = pp1fs2 (f, a, modulus, &faststage2_params);
        }
      else
        youpi = stage2 (f, &a, modulus, dF, k, &root_params, ECM_PP1,
                        use_ntt, TreeFilename, stop_asap);
    }

  if (youpi > 0 && test_verbose (OUTPUT_NORMAL))
    pp1_check_factor (p, f); /* tell user if factor was found by P-1 */

clear_and_exit:
  mpres_clear (a, modulus);
  mpmod_clear (modulus);
  if (stage2_variant != 0)
    mpz_clear (faststage2_params.m_1);
  else
    mpz_clear (root_params.i0);
  mpz_clear (B2);
  mpz_clear (B2min);

  return youpi;
}
ecm-6.4.4/README.lib0000644023561000001540000001066512106741273010653 00000000000000This is the README file for the ecm library.
To use the library, you need to add the following line in your source file: #include "ecm.h" and link with -lecm. The public interface is defined in the "ecm.h" file. It contains the following functions: int ecm_factor (mpz_t f, mpz_t n, double B1, ecm_params p) where n is the number to factor, f is the factor found (if any), B1 is the stage 1 bound, and p contains auxiliary parameters (see below). When p is NULL, default values for those parameters are chosen. The ecm_factor() function returns: * a positive value if a factor was found (1 for step 1, 2 for step 2), * zero when no factor was found, * a negative value when an error occurred. void ecm_init (ecm_params p) Initialize the parameters to default values. void ecm_clear (ecm_params p) Clear the parameters. Detailed description of parameters (ecm_params): * p->method is the factorization method (ECM_ECM for ECM, ECM_PM1 for P-1, ECM_PP1 for P+1). Default is ECM_ECM. * p->x (if non zero) is the starting point (ECM, P-1, P+1). For ECM, we take as starting point (x0 : y0) where x0=x, y0=1; for P-1, we take x0; for P+1, we take x0 as starting point of the Lucas sequence. When ecm_factor() returns, p->x is the point obtained after stage 1. * p->sigma (ECM only) is the "sigma" parameter. The elliptic curve chosen is b*y^2 = x^3 + a*x^2 + x where a = (v-u)^3*(3*u+v)/(4*u^3*v)-2, u = sigma^2-5, v = 4*sigma (Suyama's parametrization). The initial point (if p->x is zero) is taken as x0=u^3/v^3, y0=1 (thus b is taken as x0^3 + a*x0^2 + x0). * p->sigma_is_A (ECM only) indicates that p->sigma is the 'a' parameter from the elliptic curve. * p->go is the initial group order to preload (default is 1). * p->B1done tells that step 1 was already done up to B1done. This means that all prime powers <= B1done were dealt with. If for example B1done=100 and B1=200, prime 2 was dealt with up to power 6, thus it remains to "multiply" once by 2 to go up to power 7. 
Of course, all primes p such that B1done < p <= B1 will be considered with power 1. * p->B2min is the lower bound for stage 2, which will treat all primes p such that B2min <= p <= B2. If negative, B2min will be set to B1. * p->B2 is the upper bound for stage 2 (default is automatically computed from B1, to optimize the efficiency of the method). * p->k is the number of blocks used in stage 2 (default is ECM_DEFAULT_K). * p->S defines the polynomial used for Brent-Suyama's extension in stage 2. If positive, the polynomial used is x^S; if negative, it is Dickson's polynomial of degree |S| with parameter a=-1, where D_{1,a}(x) = x, D_{2,a}(x) = x^2-2*a, and D_{k+2,a}(x) = x*D_{k+1,a}(x) - a*D_{k,a}(x), or equivalently D_{k,a}(2*sqrt(a)*cos(t)) = 2*a^(k/2)*cos(k*t). If zero, choice is automatic (and should be close to optimal). Default is ECM_DEFAULT_S. * p->repr defines the representation used for modular arithmetic: 1 means the 'mpz' class from GMP, 2 means 'modmuln' (Montgomery's multiplication, quadratic implementation), 3 means 'redc' (Montgomery's multiplication, subquadratic implementation), -1 indicates not to use a special base-2 representation (when the input number is a factor of 2^n +/- 1). Other values (including 0) mean the representation will be chosen automatically (hopefully in some optimal way). * p->verbose is the verbosity level: 0 for no output, 1 for normal output (like default for GMP-ECM), 2 for diagnostic output without intermediate residues (like -v in GMP-ECM), 3 for diagnostic output with residues (like -v -v), 4 for high diagnostic output (-v -v -v), and 5 for trace output (-v -v -v -v). * p->os is the output stream used for verbose output. Default is stdout. * p->es is the output stream used for errors. Default is stderr. * p->TreeFilename if non NULL, is the file name to store the product tree of F (option -treefile f). * p->maxmem is the maximum amount of memory in bytes that should be used in stage 2.
Setting this value too low (< 10MB, say) will cause stage 2 to perform very poorly, or return with an error code. * p->stage1time is the time already spent in stage 1 (useful to get a correct estimation of the expected time to find factors). * p->rng is a random number generator state. * p->use_ntt if equal to 1, use NTT in stage 2. * p->(*stop_asap) pointer to function: if the function returns zero, continue normally, otherwise exit as soon as possible. May be NULL. ecm-6.4.4/getprime.c0000644023561000001540000002171312106741273011202 00000000000000/* Dynamic Eratosthenes sieve. Copyright 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2012 Paul Zimmermann, Alexander Kruppa, Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #ifdef OUTSIDE_LIBECM # include "ecm-ecm.h" #else # include "ecm-impl.h" #endif /* This function returns successive odd primes, starting with 3. To perform a loop over all primes <= B1, do the following (compile this file with -DMAIN to count primes): for (p = 2.0; p <= B1; p = getprime ()) { ... } It is slightly less efficient (1.5 to 2 times) than Dan Bernstein's primegen library (http://cr.yp.to/primegen.html), however it is fast enough for our usage here. 
*/ /* We allow primes up to 2^53. This means len, the primes in primes[] etc. will stay well below 2^32. */ static double offset = 0.0; /* offset for current primes, must be 0 or odd */ static int current = -1; /* index of previous prime */ static unsigned int *primes = NULL; /* table of small primes up to sqrt(p) */ static unsigned int nprimes = 0; /* length of primes[] */ static unsigned char *sieve = NULL; /* sieving table */ static int len = 0; /* length of sieving table, WITHOUT sentinel */ static unsigned int *moduli = NULL; /* offset for small primes, moduli[i] = offset mod primes[i] */ /* sieve[i] == 1 if offset+2*i is a prime, otherwise sieve[i] == 0. sieve has len + 1 bytes allocated, the last byte is always 1 (a sentinel). This allows us avoid testing for the array end in the loop that looks for the next prime in sieve[]. */ /* The last prime returned by getprime is offset + 2*current */ /* primes[] contains small primes to needed to sieve out composites in sieve, i.e. all primes <= sqrt(offset + 2 * (len - 1)). moduli[i] contains the smallest k so that offset+2*(len+k) is divisible by primes[i], i.e. after advancing the sieve array by len, sieve[moduli[i]] is divisible by primes[i]. */ void getprime_clear () { offset = 0.0; current = -1; free (primes); primes = NULL; nprimes = 0; free (sieve); sieve = NULL; len = 0; free (moduli); moduli = NULL; } /* For p > 1, return 1 if p is prime and 0 if p is not prime. Requires that all primes <= sqrt(p) are in *primes */ static int isprime_ui (unsigned int p, unsigned int *primes) { int i; for (i = 0; primes[i] * primes[i] <= p; i++) if (p % primes[i] == 0) return 0; return 1; } double getprime () { /* the following complex block is equivalent to: while ((++current < len) && (sieve[current] == 0)); but is faster. 
*/ if (len > 0L) { unsigned char *ptr = sieve + current; while (*(++ptr) == 0); current = ptr - sieve; } else current = len; if (current < len) /* most calls will end here */ return offset + 2.0 * (double) current; /* otherwise we have to advance the sieve */ offset += 2.0 * (double) len; /* first enlarge sieving table if too small */ if ((double) len * (double) len < offset && len > 0) { free (sieve); len *= 2; sieve = (unsigned char *) malloc ((len + 1) * sizeof (unsigned char)); /* assume this "small" malloc will not fail in normal usage */ if (sieve == NULL) { fprintf (stderr, "Cannot allocate memory in getprime\n"); exit (1); } } /* now enlarge small prime table if too small */ if ((nprimes == 0) || (primes[nprimes-1] < sqrt(offset + 2*len))) { if (nprimes == 0) /* initialization */ { nprimes = 1; primes = (unsigned int *) malloc (nprimes * sizeof(unsigned int)); /* assume this "small" malloc will not fail in normal usage */ ASSERT(primes != NULL); moduli = (unsigned int *) malloc (nprimes * sizeof(unsigned int)); /* assume this "small" malloc will not fail in normal usage */ ASSERT(moduli != NULL); len = 1; sieve = (unsigned char *) malloc((len + 1) * sizeof(unsigned char)); /* len=1 here */ /* assume this "small" malloc will not fail in normal usage */ ASSERT(sieve != NULL); offset = 5.0; sieve[0] = 1; /* corresponding to 5 */ sieve[1] = 1; /* place the sentinel */ primes[0] = 3; moduli[0] = 1; /* After we advance sieve[], sieve[0] will correspond to 7 and sieve[1] to 9, which is the smallest odd multiple of 3 */ current = -1; return 3.0; } else { /* extend the existing table of small primes */ unsigned int i, j; i = nprimes; nprimes *= 2; primes = (unsigned int *) realloc (primes, nprimes * sizeof(unsigned int)); moduli = (unsigned int *) realloc (moduli, nprimes * sizeof(unsigned int)); /* assume those "small" realloc's will not fail in normal usage */ ASSERT_ALWAYS(primes != NULL && moduli != NULL); for (; i < nprimes; i++) { unsigned int p; /* find next 
(odd) prime */ for (p = primes[i - 1] + 2; !isprime_ui (p, primes); p += 2); primes[i] = p; /* moduli[i] is the smallest m such that offset + 2*m = k*p */ j = (unsigned long) fmod (offset, (double) p); j = (j == 0) ? j : p - j; /* -offset mod p */ if ((j % 2) != 0) j += p; /* ensure j is even */ moduli[i] = j / 2; } } } /* now sieve for new primes */ { int i, p; unsigned int j; /* Set sieve (including sentinel at the end) to 1 */ for (i = 0; i < len + 1; i++) sieve[i] = 1; for (j = 0; j < nprimes; j++) { p = primes[j]; for (i = moduli[j]; i < len; i += p) sieve[i] = 0; moduli[j] = i - len; /* for next sieving array */ } } current = -1; while (sieve[++current] == 0); ASSERT(current < len); /* otherwise we found a prime gap >= sqrt(x) around x */ return offset + 2.0 * (double) current; } /* Skips forward or backward in the sieve so that the next call to getprime returns the smallest prime >= pp */ void getprime_seek (double pp) { int i, p; unsigned int j; if (pp <= 3.) { getprime_clear (); return; } offset = floor (pp / 2.) * 2. 
+ 1.; /* make sure offset is odd */ /* Choose a large enough sieve array length */ for (i = 2; (double) i * (double) i < offset; i *= 2); /* Now allocate sieving table */ if (len > 0) free (sieve); len = i; sieve = (unsigned char *) malloc ((len + 1) * sizeof (unsigned char)); /* assume this "small" malloc will not fail in normal usage */ ASSERT_ALWAYS(sieve != NULL); j = 1; /* Find out how many small odd primes we'll need */ for (p = 5; (double)p*(double)p <= offset + (double)(2*len); p += 2) { for (i = 3; i*i <= p && p % i != 0; i += 2); if (i*i <= p) continue; if ((double)p*(double)p < offset + (double)len) j++; } /* Allocate memory for small primes */ if (nprimes != 0) { free (primes); free (moduli); } nprimes = j; primes = (unsigned int *) malloc (nprimes * sizeof(unsigned int)); moduli = (unsigned int *) malloc (nprimes * sizeof(unsigned int)); ASSERT_ALWAYS(primes != NULL && moduli != NULL); /* Fill small primes and moduli arrays */ for (p = 3, j = 0; j < nprimes; p += 2) { for (i = 3; i*i <= p && p % i != 0; i += 2); if (i*i <= p) continue; primes[j] = p; i = (unsigned int) fmod (offset, (double)p); i = (i == 0) ? i : p - i; /* -offset mod p */ if (i % 2 != 0) i += p; /* ensure i is even */ moduli[j] = i / 2; j++; } /* now sieve for new primes */ for (i = 0; i < len + 1; i++) sieve[i] = 1; for (j = 0; j < nprimes; j++) { p = primes[j]; for (i = moduli[j]; i < len; i += p) sieve[i] = 0; moduli[j] = i - len; /* for next sieving array */ } current = -1; } #ifdef MAIN int main (int argc, char *argv[]) { double p, B1, B2; unsigned long pi = 0; if (argc != 3) { fprintf (stderr, "Usage: getprime \n"); exit (EXIT_FAILURE); } B1 = atof (argv[1]); B2 = atof (argv[2]); if (B1 > 0.) 
getprime_seek (B1); p = 0; if (B1 <= 2) { printf("2\n"); pi++; } for (p = getprime (); p <= B2; p = getprime (), pi++) printf("%1.0f\n", p); /* printf ("pi(%1.0f) - pi(%1.0f - 1) = %lu\n", B2, B1, pi); */ getprime_clear (); return 0; } #endif ecm-6.4.4/bench_mulredc.c0000644023561000001540000004306012106741273012157 00000000000000#include "config.h" #include #include #include /* for LONG_MAX */ #include #include #include #if TIME_WITH_SYS_TIME # include # include #else # if HAVE_SYS_TIME_H # include # else # include # endif #endif #define LOOPCOUNT 10000000UL #define MAXSIZE 20 int tune_mul[MAXSIZE+1], tune_sqr[MAXSIZE+1]; #include #ifdef USE_ASM_REDC #include "mulredc.h" #endif #include "mpmod.h" #ifdef HAVE___GMPN_REDC_1 #ifndef __gmpn_redc_1 void __gmpn_redc_1 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t); #endif #endif #ifdef HAVE___GMPN_REDC_2 #ifndef __gmpn_redc_2 void __gmpn_redc_2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #endif #endif #ifdef HAVE___GMPN_REDC_N #ifndef __gmpn_redc_N void __gmpn_redc_n (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #endif #endif /* cputime () gives the elapsed time in milliseconds */ #if defined (_WIN32) /* First case - GetProcessTimes () is the only known way of getting process * time (as opposed to calendar time) under mingw32 */ #include long cputime () { FILETIME lpCreationTime, lpExitTime, lpKernelTime, lpUserTime; ULARGE_INTEGER n; HANDLE hProcess = GetCurrentProcess(); GetProcessTimes (hProcess, &lpCreationTime, &lpExitTime, &lpKernelTime, &lpUserTime); /* copy FILETIME to a ULARGE_INTEGER as recommended by MSDN docs */ n.u.LowPart = lpUserTime.dwLowDateTime; n.u.HighPart = lpUserTime.dwHighDateTime; /* lpUserTime is in units of 100 ns. Return time in milliseconds */ return (long) (n.QuadPart / 10000); } #elif defined (HAVE_GETRUSAGE) /* Next case: getrusage () has higher resolution than clock () and so is preferred. 
*/ #ifdef HAVE_SYS_TYPES_H # include #endif #ifdef HAVE_SYS_RESOURCE_H # include #endif long cputime () { struct rusage rus; getrusage (RUSAGE_SELF, &rus); /* This overflows a 32 bit signed int after 2147483s = 24.85 days */ return rus.ru_utime.tv_sec * 1000L + rus.ru_utime.tv_usec / 1000L; } #else /* Resort to clock (), which on some systems may return calendar time. */ long cputime () { /* Return time in milliseconds */ return (long) (clock () * (1000. / (double) CLOCKS_PER_SEC)); } #endif /* defining cputime () */ void mp_print(mp_limb_t *x, int N) { int i; for (i = 0; i < N-1; ++i) gmp_printf("%Nd + W*(", x + i, 1); gmp_printf("%Nd", x + (N-1), 1); for (i = 0; i < N-1; ++i) printf(")"); printf("\n"); } static void ecm_redc_1_svoboda (mp_ptr rp, mp_ptr tmp, mp_srcptr np, mp_size_t nn, mp_limb_t invm, mp_srcptr sp) { mp_size_t j; mp_limb_t t0, cy; /* instead of adding {np, nn} * (invm * tmp[0] mod B), we add {sp, nn} * tmp[0], where {np, nn} * invm = B * {sp, nn} - 1 */ for (j = 0; j < nn - 1; j++, tmp++) rp[j + 1] = mpn_addmul_1 (tmp + 1, sp, nn, tmp[0]); /* for the last step, we reduce with {np, nn} */ t0 = mpn_addmul_1 (tmp, np, nn, tmp[0] * invm); tmp ++; rp[0] = tmp[0]; cy = mpn_add_n (rp + 1, rp + 1, tmp + 1, nn - 1); rp[nn-1] += t0; cy += rp[nn-1] < t0; if (cy != 0) mpn_sub_n (rp, rp, np, nn); /* a borrow should always occur here */ } void bench(mp_size_t N) { mp_limb_t *x, *y, *z, *m, *invm, *tmp, *svoboda1; unsigned long i; unsigned long iter; long tmul, tsqr, tredc_1, t_mulredc_1, tsvoboda1 = 0, t_sqrredc_1; long tmul_best = LONG_MAX, tsqr_best = LONG_MAX, tredc_best = LONG_MAX; mpz_t M, B; #ifdef USE_ASM_REDC long t2; #endif #ifdef HAVE_NATIVE_MULREDC1_N long t3 = 0; #endif #ifdef HAVE___GMPN_REDC_2 long tredc_2, t_mulredc_2, t_sqrredc_2; #endif #ifdef HAVE___GMPN_REDC_N long tredc_n, t_mulredc_n, t_sqrredc_n; #endif x = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); y = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); z = (mp_limb_t *) 
malloc((2*N)*sizeof(mp_limb_t)); m = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); tmp = (mp_limb_t *) malloc((2*N+2)*sizeof(mp_limb_t)); invm = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); svoboda1 = (mp_limb_t *) malloc(N*sizeof(mp_limb_t)); mpn_random(m, N); m[0] |= 1UL; if (m[N-1] == 0) m[N-1] = 1UL; mpz_init (M); mpz_init (B); mpz_set_ui (M, m[1]); mpz_mul_2exp (M, M, GMP_NUMB_BITS); mpz_add_ui (M, M, m[0]); mpz_set_ui (B, 1); mpz_mul_2exp (B, B, 2 * GMP_NUMB_BITS); mpz_invert (M, M, B); mpz_sub (M, B, M); for (i = 0; i < (unsigned) N; i++) invm[i] = mpz_getlimbn(M, i); tmp[N] = mpn_mul_1 (tmp, m, N, invm[0]); /* {tmp,N+1} should be = -1 mod B */ mpn_add_1 (tmp, tmp, N + 1, 1); /* now = 0 mod B */ mpn_copyi (svoboda1, tmp + 1, N); mpz_clear (M); mpz_clear (B); mpn_random(x, N); mpn_random(y, N); /* we set 'iter' to get about 100ms for each test */ tmul = cputime(); i = 0; iter = 1; do { iter = 2 * iter; for (; i < iter; i++) mpn_mul_n (tmp, x, y, N); } while (cputime() - tmul < 100); iter = (long) (((double) iter * 100.0) / (double) (cputime() - tmul)); tmul = cputime(); for (i = 0; i < iter; ++i) mpn_mul_n(tmp, x, y, N); tmul = cputime()-tmul; tsqr = cputime(); for (i = 0; i < iter; ++i) mpn_sqr (tmp, x, N); tsqr = cputime()-tsqr; #ifdef HAVE___GMPN_REDC_1 mpn_mul_n(tmp, x, y, N); tredc_1 = cputime(); for (i = 0; i < iter; ++i) __gmpn_redc_1 (z, tmp, m, N, invm[0]); tredc_1 = cputime()-tredc_1; if (tredc_1 < tredc_best) tredc_best = tredc_1; #endif if (N > 1) /* Svoboda only works for N > 1 */ { mpn_mul_n(tmp, x, y, N); tsvoboda1 = cputime(); for (i = 0; i < iter; ++i) ecm_redc_1_svoboda (z, tmp, m, N, invm[0], svoboda1); tsvoboda1 = cputime()-tsvoboda1; if (tsvoboda1 < tredc_best) tredc_best = tsvoboda1; } #ifdef HAVE___GMPN_REDC_2 mpn_mul_n(tmp, x, y, N); tredc_2 = cputime(); for (i = 0; i < iter; ++i) __gmpn_redc_2 (z, tmp, m, N, invm); tredc_2 = cputime()-tredc_2; if (tredc_2 < tredc_best) tredc_best = tredc_2; #endif #ifdef HAVE___GMPN_REDC_N mpn_mul_n(tmp, 
x, y, N); tredc_n = cputime(); for (i = 0; i < iter; ++i) __gmpn_redc_n (z, tmp, m, N, invm); tredc_n = cputime()-tredc_n; if (tredc_n < tredc_best) tredc_best = tredc_n; #endif #ifdef USE_ASM_REDC /* Mixed mul and redc */ t2 = cputime(); switch (N) { case 1: for (i=0; i < iter; ++i) { mulredc1(z, x[0], y[0], m[0], invm[0]); x[0] += tmp[0]; } break; case 2: for (i=0; i < iter; ++i) { mulredc2(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 3: for (i=0; i < iter; ++i) { mulredc3(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 4: for (i=0; i < iter; ++i) { mulredc4(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 5: for (i=0; i < iter; ++i) { mulredc5(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 6: for (i=0; i < iter; ++i) { mulredc6(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 7: for (i=0; i < iter; ++i) { mulredc7(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 8: for (i=0; i < iter; ++i) { mulredc8(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 9: for (i=0; i < iter; ++i) { mulredc9(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 10: for (i=0; i < iter; ++i) { mulredc10(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 11: for (i=0; i < iter; ++i) { mulredc11(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 12: for (i=0; i < iter; ++i) { mulredc12(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 13: for (i=0; i < iter; ++i) { mulredc13(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 14: for (i=0; i < iter; ++i) { mulredc14(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 15: for (i=0; i < iter; ++i) { mulredc15(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 16: for (i=0; i < iter; ++i) { mulredc16(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 17: for (i=0; i < iter; ++i) { mulredc17(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 18: for (i=0; i < iter; ++i) { mulredc18(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; case 19: for (i=0; i < iter; ++i) { mulredc19(z, x, y, m, invm[0]); x[0] += 
tmp[0]; } break; case 20: for (i=0; i < iter; ++i) { mulredc20(z, x, y, m, invm[0]); x[0] += tmp[0]; } break; default: for (i=0; i < iter; ++i) { mulredc20(z, x, y, m, invm[0]); x[0] += tmp[0]; } } t2 = cputime()-t2; if (t2 < tmul_best) { tmul_best = t2; tune_mul[N] = MPMOD_MULREDC; } if (t2 < tsqr_best) { tsqr_best = t2; tune_sqr[N] = MPMOD_MULREDC; } #endif /* Mul followed by mpn_redc_1 */ #ifdef HAVE___GMPN_REDC_1 t_mulredc_1 = cputime(); for (i = 0; i < iter; ++i) { mpn_mul_n(tmp, x, y, N); __gmpn_redc_1 (z, tmp, m, N, invm[0]); x[0] += tmp[0]; } t_mulredc_1 = cputime()-t_mulredc_1; if (t_mulredc_1 < tmul_best) { tune_mul[N] = MPMOD_MUL_REDC1; tmul_best = t_mulredc_1; } #endif /* Mul followed by mpn_redc_2 */ #ifdef HAVE___GMPN_REDC_2 t_mulredc_2 = cputime(); for (i = 0; i < iter; ++i) { mpn_mul_n(tmp, x, y, N); __gmpn_redc_2 (z, tmp, m, N, invm); x[0] += tmp[0]; } t_mulredc_2 = cputime()-t_mulredc_2; if (t_mulredc_2 < tmul_best) { tune_mul[N] = MPMOD_MUL_REDC2; tmul_best = t_mulredc_2; } #endif /* Mul followed by mpn_redc_n */ #ifdef HAVE___GMPN_REDC_N t_mulredc_n = cputime(); for (i = 0; i < iter; ++i) { mpn_mul_n (tmp, x, y, N); __gmpn_redc_n (z, tmp, m, N, invm); } t_mulredc_n = cputime()-t_mulredc_n; if (t_mulredc_n < tmul_best) { tune_mul[N] = MPMOD_MUL_REDCN; tmul_best = t_mulredc_n; } #endif /* Sqr followed by mpn_redc_1 */ #ifdef HAVE___GMPN_REDC_1 t_sqrredc_1 = cputime(); for (i = 0; i < iter; ++i) { mpn_sqr(tmp, x, N); __gmpn_redc_1 (z, tmp, m, N, invm[0]); x[0] += tmp[0]; } t_sqrredc_1 = cputime()-t_sqrredc_1; if (t_sqrredc_1 < tsqr_best) { tune_sqr[N] = MPMOD_MUL_REDC1; tsqr_best = t_sqrredc_1; } #endif /* Sqr followed by mpn_redc_2 */ #ifdef HAVE___GMPN_REDC_2 t_sqrredc_2 = cputime(); for (i = 0; i < iter; ++i) { mpn_sqr(tmp, x, N); __gmpn_redc_2 (z, tmp, m, N, invm); x[0] += tmp[0]; } t_sqrredc_2 = cputime()-t_sqrredc_2; if (t_sqrredc_2 < tsqr_best) { tune_sqr[N] = MPMOD_MUL_REDC2; tsqr_best = t_sqrredc_2; } #endif /* Sqr followed by mpn_redc_n 
*/ #ifdef HAVE___GMPN_REDC_N t_sqrredc_n = cputime(); for (i = 0; i < iter; ++i) { mpn_sqr (tmp, x, N); __gmpn_redc_n (z, tmp, m, N, invm); } t_sqrredc_n = cputime()-t_sqrredc_n; if (t_sqrredc_n < tsqr_best) { tune_sqr[N] = MPMOD_MUL_REDCN; tsqr_best = t_sqrredc_n; } #endif #ifdef HAVE_NATIVE_MULREDC1_N /* mulredc1 */ t3 = cputime(); switch (N) { case 1: for (i=0; i 1) { fprintf (stderr, "svoboda1 = %.3f", (double) tsvoboda1 * 1e3 / (double) iter); if (tsvoboda1 == tredc_best) fprintf (stderr, " *"); fprintf (stderr, "\n"); } #ifdef HAVE___GMPN_REDC_2 fprintf (stderr, "mpn_redc_2 = %.3f", (double) tredc_2 * 1e3 / (double) iter); if (tredc_2 == tredc_best) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_N fprintf (stderr, "mpn_redc_n = %.3f", (double) tredc_n * 1e3 / (double) iter); if (tredc_n == tredc_best) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif fprintf (stderr, "\n"); /* modular multiplication */ #ifdef USE_ASM_REDC fprintf (stderr, "mulredc = %.3f", (double) t2 * 1e3 / (double) iter); if (tmul_best == t2) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_1 fprintf (stderr, "mul+redc_1 = %.3f", (double) t_mulredc_1 * 1e3 / (double) iter); if (tmul_best == t_mulredc_1) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_2 fprintf (stderr, "mul+redc_2 = %.3f", (double) t_mulredc_2 * 1e3 / (double) iter); if (tmul_best == t_mulredc_2) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_N fprintf (stderr, "mul+redc_n = %.3f", (double) t_mulredc_n * 1e3 / (double) iter); if (tmul_best == t_mulredc_n) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif fprintf (stderr, "\n"); /* modular squaring */ #ifdef USE_ASM_REDC fprintf (stderr, "mulredc = %.3f", (double) t2 * 1e3 / (double) iter); if (tsqr_best == t2) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_1 fprintf (stderr, "sqr+redc_1 = %.3f", (double) 
t_sqrredc_1 * 1e3 / (double) iter); if (tsqr_best == t_sqrredc_1) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_2 fprintf (stderr, "sqr+redc_2 = %.3f", (double) t_sqrredc_2 * 1e3 / (double) iter); if (tsqr_best == t_sqrredc_2) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE___GMPN_REDC_N fprintf (stderr, "sqr+redc_n = %.3f", (double) t_sqrredc_n * 1e3 / (double) iter); if (tsqr_best == t_sqrredc_n) fprintf (stderr, " *"); fprintf (stderr, "\n"); #endif #ifdef HAVE_NATIVE_MULREDC1_N /* multiplication of n limbs by one limb */ fprintf (stderr, "mulredc1 = %.3f\n", (double) t3 * 1e3 / (double) LOOPCOUNT); #endif fflush (stderr); free (tmp); free (x); free (y); free (z); free (m); free (invm); free (svoboda1); } int main(int argc, char** argv) { int i; int minsize = 1, maxsize = MAXSIZE; if (argc > 1) minsize = atoi (argv[1]); if (argc > 2) maxsize = atoi (argv[2]); for (i = minsize; i <= maxsize; ++i) bench(i); printf ("/* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */\n"); printf ("#define TUNE_MULREDC_TABLE {0"); for (i = 1; i <= maxsize; i++) printf (",%d", tune_mul[i]); printf ("}\n"); printf ("/* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */\n"); printf ("#define TUNE_SQRREDC_TABLE {0"); for (i = 1; i <= maxsize; i++) printf (",%d", tune_sqr[i]); printf ("}\n"); fflush (stdout); return 0; } ecm-6.4.4/ellparam_batch.c0000644023561000001540000001536012106741273012325 00000000000000/* ellparam_batch.c - Parametrization for batch mode 2 Copyright 2012 Cyril Bouvier. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "ecm-gmp.h" #include "ecm-impl.h" #if 0 /* this function is useful in debug mode to print residues */ static void mpres_print (mpres_t x, char* name, mpmod_t n) { mp_size_t m, xn; mpres_t t; mpres_init(t, n); mpz_set_ui(t, 1); mpres_mul (t, x, t, n); xn = SIZ(t); m = ABSIZ(t); MPN_NORMALIZE(PTR(t), m); SIZ(t) = xn >= 0 ? m : -m; gmp_printf ("%s=%Zd\n", name, t); SIZ(t) = xn; mpres_clear (t, n); } #endif static void dbl_param (mpres_t x, mpres_t y, mpres_t z, mpres_t t, mpres_t u, mpres_t v, mpmod_t n) { mpres_mul (z, y, z, n); /* Y1*Z1 */ mpres_mul_ui (z, z, 2, n); /* Z3 = 2*Y1*Z1 */ mpres_sqr (u, x, n); /* A = X1*X1 */ mpres_sqr (t, y, n); /* B = Y1*Y1 */ mpres_sqr (y, t, n); /* C = B^2 */ mpres_add (v, x, t, n); /* X1+B */ mpres_sqr (v, v, n); /* (X1+B)^2 */ mpres_sub (v, v, u, n); /* (X1+B)^2-A */ mpres_sub (v, v, y, n); /* (X1+B)^2-A-C */ mpres_mul_ui (v, v, 2, n); /* D = 2*((X1+B)^2-A-C) */ mpres_mul_ui (u, u, 3, n); /* E = 3*A */ mpres_sqr (t, u, n); /* F = E^2 */ mpres_mul_ui (x, v, 2, n); /* 2*D */ mpres_sub (x, t, x, n); /* X3 = F-2*D */ mpres_sub (v, v, x, n); /* D-X3 */ mpres_mul_ui (y, y, 8, n); /* 8*C */ mpres_mul (t, u, v, n); /* E*(D-X3) */ mpres_sub (y, t, y, n); /* Y3 = E*(D-X3)-8*C */ } /*Add sgn*P=(-3:sgn*3:1) to Q=(x:y:z) */ static void add_param (mpres_t x, mpres_t y, mpres_t z, int sgn, mpres_t t, mpres_t u, mpres_t v, mpres_t w, mpmod_t n) { mpres_sqr (t, z, n); /* Z1Z1 = Z1^2 */ mpres_mul_ui (u, t, 3, n); mpres_neg (u, u, 
n); /* U2 = X2*Z1Z1 with X2=-3 */ mpres_mul (v, z, t, n); /* Z1*Z1Z1 */ mpres_mul_ui (v, v, 3, n); /* S2 = Y2*Z1*Z1Z1 with Y2=sgn*3 */ if (sgn == -1) mpres_neg (v, v, n); /* S2 = Y2*Z1*Z1Z1 with Y2=sgn*3 */ mpres_sub (u, u, x, n); /* H = U2-X1 */ mpres_sqr (w, u, n); /* HH = H^2 */ mpres_add (z, z, u, n); /* Z1+H */ mpres_sqr (z, z, n); /* (Z1+H)^2 */ mpres_sub (z, z, t, n); /* (Z1+H)^2-Z1Z1 */ mpres_sub (z, z, w, n); /* Z3 = (Z1+H)^2-Z1Z1-HH */ mpres_mul_ui (t, w, 4, n); /* I = 4*HH */ mpres_mul (u, u, t, n); /* J = H*I */ mpres_sub (v, v, y, n); /* S2-Y1 */ mpres_mul_ui (v, v, 2, n); /* r = 2*(S2-Y1) */ mpres_mul (t, x, t, n); /* V = X1*I */ mpres_sqr (x, v, n); /* r^2 */ mpres_mul_ui (w, t, 2, n); /* 2*V */ mpres_sub (x, x, u, n); /* r^2-J */ mpres_sub (x, x, w, n); /* X3 = r^2-J-2*V */ mpres_sub (w, t, x, n); /* V-X3 */ mpres_mul (y, y, u, n); /* Y1*J */ mpres_mul_ui (y, y, 2, n); /* 2*Y1*J */ mpres_mul (w, v, w, n); /* r*(V-X3) */ mpres_sub (y, w, y, n); /* Y3=r*(V-X3)-2*Y1*J */ } static void addchain_param (mpres_t x, mpres_t y, mpres_t z, unsigned int s, mpres_t t, mpres_t u, mpres_t v, mpres_t w, mpmod_t n) { if (s == 1) { mpres_set_si (x, -3, n); mpres_set_ui (y, 3, n); mpres_set_ui (z, 1, n); } else if (s == 3) { addchain_param(x, y, z, s-1, t, u, v, w, n); add_param (x, y, z, +1, t, u, v, w, n); } else if (s % 2 == 0) { addchain_param(x, y, z, s/2, t, u, v, w, n); dbl_param (x, y, z, t, u, v, n); } else if (s % 4 == 1) { addchain_param(x, y, z, s-1, t, u, v, w, n); add_param (x, y, z, +1, t, u, v, w, n); } else /* (s % 4 == 3) and s != 3 */ { addchain_param(x, y, z, s+1, t, u, v, w, n); add_param (x, y, z, -1, t, u, v, w, n); } } /*Parametrization for BATCHMODE 2: generate curves with a point of order 3 and starting point (2:1) Compute k*P on y^2=x^3+36 with P=(-3,3); need k>1 x3 = (3*x+y+6)/(2*(y-3)) and A=-(3*x3^4+6*x3^2-1)/(4*x3^3)*/ int get_curve_from_ell_parametrization (mpz_t f, mpres_t A, mpz_t k, mpmod_t n) { mpres_t t, u, v, w, x, y, z; unsigned 
int s; MEMORY_TAG; mpres_init (t, n); MEMORY_TAG; mpres_init (u, n); MEMORY_TAG; mpres_init (v, n); MEMORY_TAG; mpres_init (w, n); MEMORY_TAG; mpres_init (x, n); MEMORY_TAG; mpres_init (y, n); MEMORY_TAG; mpres_init (z, n); MEMORY_UNTAG; s = mpz_get_ui (k); addchain_param (x, y, z, s, t, u, v, w, n); /* Now (x:y:z) = k*P */ if (!mpres_invert(u, z, n)) { mpres_gcd (f, z, n); mpres_clear (t, n); mpres_clear (u, n); mpres_clear (v, n); mpres_clear (w, n); mpres_clear (x, n); mpres_clear (y, n); mpres_clear (z, n); return ECM_FACTOR_FOUND_STEP1; } mpres_sqr (v, u, n); mpres_mul (u, v, u, n); mpres_mul (x, x, v, n); mpres_mul (y, y, u, n); mpres_sub_ui (t, y, 3, n); mpres_mul_ui (t, t, 2, n); if (!mpres_invert(u, t, n)) { mpres_gcd (f, t, n); mpres_clear (t, n); mpres_clear (u, n); mpres_clear (v, n); mpres_clear (w, n); mpres_clear (x, n); mpres_clear (y, n); mpres_clear (z, n); return ECM_FACTOR_FOUND_STEP1; } mpres_mul_ui (w, x, 3, n); mpres_add (w, w, y, n); mpres_add_ui (w, w, 6, n); mpres_mul (x, w, u, n); /* Now x contains x_3 */ /* A=-(3*x3^4+6*x3^2-1)/(4*x3^3) */ mpres_sqr (u, x, n); mpres_mul (v, u, x, n); mpres_sqr (w, u, n); mpres_mul_ui (u, u, 6, n); mpres_neg (u, u, n); mpres_mul_ui (v, v, 4, n); mpres_mul_ui (w, w, 3, n); mpres_neg (w, w, n); if (!mpres_invert(t, v, n)) { mpres_gcd (f, v, n); mpres_clear (t, n); mpres_clear (u, n); mpres_clear (v, n); mpres_clear (w, n); mpres_clear (x, n); mpres_clear (y, n); mpres_clear (z, n); return ECM_FACTOR_FOUND_STEP1; } mpres_add (w, w, u, n); mpres_add_ui (w, w, 1, n); mpres_mul (A, w, t, n); mpz_mod (A, A, n->orig_modulus); mpres_clear (t, n); mpres_clear (u, n); mpres_clear (v, n); mpres_clear (w, n); mpres_clear (x, n); mpres_clear (y, n); mpres_clear (z, n); return ECM_NO_FACTOR_FOUND; } ecm-6.4.4/ecm-params.h.sparc640000644023561000001540000000162612106741273012702 00000000000000/* those parameters were obtained on gcc54.fsffrance.org with ecm-6.4.1-rc3 gmp-5.0.2, and gcc 4.3.2 -O2 -pedantic -m64 -mptr64 
-mcpu=ultrasparc (sparc64-unknown-linux-gnu) */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,1,1,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} #define MPZMOD_THRESHOLD 104 #define REDC_THRESHOLD 341 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 11, 13, 13, 13, 15, 14, 15, 16, 17, 16, 17, 16, 19, 19, 19, 20, 21, 22} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 262144 #define PREREVERTDIVISION_NTT_THRESHOLD 262144 #define POLYINVERT_NTT_THRESHOLD 262144 #define POLYEVALT_NTT_THRESHOLD 262144 #define MPZSPV_NORMALISE_STRIDE 64 ecm-6.4.4/ecm-ecm.h0000644023561000001540000002035012106741273010675 00000000000000/* ecm-ecm.h - private header file for GMP-ECM. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Paul Zimmermann, Alexander Kruppa and Cyril Bouvier. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef _ECM_ECM_H #define _ECM_ECM_H 1 #include "config.h" #include #define ASSERT_ALWAYS(expr) assert (expr) #ifdef WANT_ASSERT #define ASSERT(expr) assert (expr) #else #define ASSERT(expr) do {} while (0) #endif #include "ecm.h" /* Structure for candidate usage. This is much more powerful than using a simple mpz_t to hold the candidate. This structure also houses the expression (in raw form), and will modify the expression as factors are found (if in looping modes). Also, since we are warehousing all of the data associated with the candidate, we also store whether the candidate is PRP here (so testing will cease), along with the length of the candidate. As each factor is found, the candidate will also have the factor removed from it */ typedef struct { #if defined (CANDI_DEBUG) unsigned long magic; /* used for debugging purposes while writing this code */ #endif char *cpExpr; /* if non-NULL, then this is a "simpler" expression than the decimal output of n */ mpz_t n; /* the cofactor candidate currently being used to find factors from */ unsigned ndigits; /* the number of digits (decimal) in n */ unsigned nexprlen; /* strlen of expression, 0 if there is NO expression */ int isPrp; /* usually 0, but turns 1 if factor found, and the cofactor is PRP, OR if the original candidate was PRP and the user asked to prp check */ } mpcandi_t; typedef struct { int Valid; /* Is ONLY set to 1 if there is a proper -go switch. Otherwise is 0 and if 0, then PM1, PP1 and ECM all ignore it */ char *cpOrigExpr; /* if non-NULL, then this is a "simpler" expression than the decimal output of n */ mpcandi_t Candi; /* The value after expression checked */ int containsN; /* 0 for simple number or expression. 
1 if the expression "contains" N as that expression will have to be built for each candidate */ } mpgocandi_t; /* auxi.c */ unsigned int nb_digits (const mpz_t); int probab_prime_p (mpz_t, int); int read_number (mpcandi_t*, FILE*, int); /* Various logging levels */ /* OUTPUT_ALWAYS means print always, regardless of verbose value */ #define OUTPUT_ALWAYS 0 /* OUTPUT_NORMAL means print during normal program execution */ #define OUTPUT_NORMAL 1 /* OUTPUT_VERBOSE means print if the user requested more verbosity */ #define OUTPUT_VERBOSE 2 /* OUTPUT_RESVERBOSE is for printing residues (after stage 1 etc) */ #define OUTPUT_RESVERBOSE 3 /* OUTPUT_DEVVERBOSE is for printing internal parameters (for developers) */ #define OUTPUT_DEVVERBOSE 4 /* OUTPUT_TRACE is for printing trace data, produces lots of output */ #define OUTPUT_TRACE 5 /* OUTPUT_ERROR is for printing error messages */ #define OUTPUT_ERROR -1 #define MAX_NUMBER_PRINT_LEN 1000 #define NTT_SIZE_THRESHOLD 30 /* auxlib.c */ int test_verbose (int); void set_verbose (int); int inc_verbose (); /* Return codes */ /* Bit coded values: 1: error (for example out of memory) 2: proper factor found, 4: factor is prime, 8: cofactor is prime or 1 */ #define ECM_EXIT_ERROR 1 #define ECM_COMP_FAC_COMP_COFAC 2 #define ECM_PRIME_FAC_COMP_COFAC (2+4) #define ECM_INPUT_NUMBER_FOUND 8 #define ECM_COMP_FAC_PRIME_COFAC (2+8) #define ECM_PRIME_FAC_PRIME_COFAC (2+4+8) /* getprime.c */ double getprime (); void getprime_clear (); void getprime_seek (double); #define WANT_FREE_PRIME_TABLE(p) (p < 0.0) #define FREE_PRIME_TABLE -1.0 /* b1_ainc.c */ double calc_B1_AutoIncrement(double cur_B1, double incB1val, int calcInc); /* memory.c */ #ifdef MEMORY_DEBUG void __gmp_default_free (void *, size_t); void *__gmp_default_allocate (size_t); void *__gmp_default_reallocate (void *, size_t, size_t); void tests_memory_start (void); void tests_memory_end (void); void tests_memory_reset (void); void tests_free (void *, size_t); void tests_memory_status 
(void); void tests_memory_set_location (char *, unsigned int); #endif /* trial.c */ int trial_factor (mpcandi_t *n, double maxfact, int deep); /* resume.c */ int read_resumefile_line (int *, mpz_t, mpcandi_t *, mpz_t, mpz_t, mpz_t, double *, char *, char *, char *, char *, FILE *); int write_resumefile_line (char *, int, double, mpz_t, mpz_t, mpz_t, mpcandi_t *, mpz_t, const char *); /* main.c */ int kbnc_z (double *k, unsigned long *b, unsigned long *n, signed long *c, mpz_t z); int kbnc_str (double *k, unsigned long *b, unsigned long *n, signed long *c, char *z, mpz_t num); /* batch.c */ void compute_s (mpz_t, unsigned long); int write_s_in_file (char *, mpz_t); void read_s_from_file (mpz_t, char *); /* eval.c */ int eval (mpcandi_t *n, FILE *fd, int bPrp); int eval_str (mpcandi_t *n, char *cp, int primetest, char **EndChar); /* EndChar can be NULL */ void init_expr (void); void free_expr (void); /* candi.c */ void mpcandi_t_init (mpcandi_t *n); /* damn, a C++ class sure would have been nice :( */ void mpcandi_t_free (mpcandi_t *n); int mpcandi_t_copy (mpcandi_t *to, mpcandi_t *from); int mpcandi_t_add_candidate (mpcandi_t *n, mpz_t c, const char *cpExpr, int bPrp); int mpcandi_t_addfoundfactor (mpcandi_t *n, mpz_t f, int displaywarning); int mpcandi_t_addfoundfactor_d (mpcandi_t *n, double f); /* candi.c Group Order candidate functions. 
*/
void mpgocandi_t_init(mpgocandi_t *go);
void mpgocandi_t_free(mpgocandi_t *go);
int mpgocandi_fixup_with_N(mpgocandi_t *go, mpcandi_t *n);

/* random.c */
unsigned long get_random_ul (void);

/* random2.c */
void pp1_random_seed (mpz_t, mpz_t, gmp_randstate_t);
void pm1_random_seed (mpz_t, mpz_t, gmp_randstate_t);

/* default number of probable prime tests */
#define PROBAB_PRIME_TESTS 1

/* maximal stage 1 bound = 2^53 - 1, the next prime being 2^53 + 5 */
#define MAX_B1 9007199254740991.0

/* The checksum for savefile is the product of all mandatory fields, modulo the greatest prime below 2^32 */
#define CHKSUMMOD 4294967291U

/* Memory helpers.  With MEMORY_DEBUG, MEMORY_TAG records the current
   file/line through tests_memory_set_location and MEMORY_UNTAG resets it;
   MPZ_INIT/MPZ_INIT2 bracket the GMP initialisation with that pair.  The
   multi-statement forms are wrapped in do { } while (0) so that they behave
   as a single statement (e.g. as the body of an if with an else), like the
   one-call forms used without MEMORY_DEBUG. */
#ifdef MEMORY_DEBUG
#define FREE(ptr,size) tests_free(ptr,size)
#define MEMORY_TAG tests_memory_set_location(__FILE__,__LINE__)
#define MEMORY_UNTAG tests_memory_set_location("",0)
#define MPZ_INIT(x) do {MEMORY_TAG;mpz_init(x);MEMORY_UNTAG;} while (0)
#define MPZ_INIT2(x,n) do {MEMORY_TAG;mpz_init2(x,n);MEMORY_UNTAG;} while (0)
#else
#define FREE(ptr,size) free(ptr)
#define MEMORY_TAG do{}while(0)
#define MEMORY_UNTAG do{}while(0)
#define MPZ_INIT(x) mpz_init(x)
#define MPZ_INIT2(x,n) mpz_init2(x,n)
#endif

#define ABS(x) ((x) >= 0 ?
(x) : -(x)) /* could go in auxi.c as a function */ #ifdef HAVE_SETPRIORITY # include # ifdef HAVE_SYS_RESOURCE_H # include # endif # define NICE10 setpriority (PRIO_PROCESS, 0, 10) # define NICE20 setpriority (PRIO_PROCESS, 0, 20) #elif defined(HAVE_NICE) # ifdef HAVE_UNISTD_H # include # endif # define NICE10 nice (10) # define NICE20 nice (20) #elif defined(HAVE_WINDOWS_H) # include # define NICE10 do { \ SetPriorityClass (GetCurrentProcess (), BELOW_NORMAL_PRIORITY_CLASS); \ SetThreadPriority (GetCurrentThread (), THREAD_PRIORITY_BELOW_NORMAL); \ } while (0) # define NICE20 do { \ SetPriorityClass (GetCurrentProcess (), IDLE_PRIORITY_CLASS); \ SetThreadPriority (GetCurrentThread (), THREAD_PRIORITY_IDLE); \ } while (0) #else # warning "Can't find a way to change priority" # define NICE10 do {} while (0) # define NICE20 do {} while (0) #endif #endif /* _ECM_ECM_H */ ecm-6.4.4/ecm-params.h.alpha-ev560000644023561000001540000000107412106741273013265 00000000000000/* parameters kindly provided by Torbjorn Granlund, and produced with ecm-6.3-rc3 on alphaev56-unknown-freebsd6.4 */ #define MPZMOD_THRESHOLD 58 #define REDC_THRESHOLD 493 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 4096 #define PREREVERTDIVISION_NTT_THRESHOLD 64 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 128 ecm-6.4.4/ecm.10000644023561000001540000003670712113421230010043 00000000000000'\" t .\" Title: ECM .\" Author: [see the "AUTHORS" section] .\" Generator: DocBook XSL Stylesheets v1.76.1 .\" Date: 02/27/2013 .\" Manual: April 22, 2003 .\" Source: April 22, 2003 .\" Language: English .\" .TH "ECM" "1" "02/27/2013" "April 22, 2003" "April 22, 2003" .\" ----------------------------------------------------------------- .\" * Define some portability 
stuff .\" ----------------------------------------------------------------- .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .\" http://bugs.debian.org/507673 .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" ----------------------------------------------------------------- .\" * set default formatting .\" ----------------------------------------------------------------- .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) .ad l .\" ----------------------------------------------------------------- .\" * MAIN CONTENT STARTS HERE * .\" ----------------------------------------------------------------- .SH "NAME" ecm \- integer factorization using ECM, P\-1 or P+1 .SH "SYNOPSIS" .HP \w'\fBecm\fR\ 'u \fBecm\fR [\fBoptions\fR] \fIB1\fR [\fIB2min\fR\-\fIB2max\fR | \fIB2\fR] .br .SH "DESCRIPTION" .PP ecm is an integer factoring program using the Elliptic Curve Method (ECM), the P\-1 method, or the P+1 method\&. The following sections describe parameters relevant to these algorithms\&. .SH "STEP 1 AND STEP 2 BOUND PARAMETERS" .PP \fB\fIB1\fR\fR .RS 4 \fIB1\fR is the step 1 bound\&. It is a mandatory parameter\&. It can be given either in integer format (for example 3000000) or in floating\-point format (3000000\&.0 or 3e6)\&. The largest possible \fIB1\fR value is 9007199254740996 for P\-1, and ULONG_MAX or 9007199254740996 (whichever is smaller) for ECM and P+1\&. All primes 2 <= p <= \fIB1\fR are processed in step 1\&. .RE .PP \fB\fIB2\fR\fR .RS 4 \fIB2\fR is the step 2 bound\&. It is optional: if omitted, a default value is computed from \fIB1\fR, which should be close to optimal\&. Like \fIB1\fR, it can be given either in integer or in floating\-point format\&. 
The largest possible value of \fIB2\fR is approximately 9e23, but depends on the number of blocks \fIk\fR if you specify the \fB\-k\fR option\&. All primes \fIB1\fR <= p <= \fIB2\fR are processed in step 2\&. If \fIB2\fR < \fIB1\fR, no step 2 is performed\&. .RE .PP \fB\fIB2min\fR\fR\fB\-\fR\fB\fIB2max\fR\fR .RS 4 Alternatively one may use the \fIB2min\fR\-\fIB2max\fR form, which means that all primes \fIB2min\fR <= p <= \fIB2max\fR should be processed\&. Thus specifying \fIB2\fR only corresponds to \fIB1\fR\-\fIB2\fR\&. The values of \fIB2min\fR and \fIB2max\fR may be arbitrarily large, but their difference must not exceed approximately 9e23, subject to the number of blocks \fIk\fR\&. .RE .SH "FACTORING METHOD" .PP \fB\-pm1\fR .RS 4 Perform P\-1 instead of the default method (ECM)\&. .RE .PP \fB\-pp1\fR .RS 4 Perform P+1 instead of the default method (ECM)\&. .RE .SH "GROUP AND INITIAL POINT PARAMETERS" .PP \fB\-x0 \fR\fB\fIx\fR\fR .RS 4 [ECM, P\-1, P+1] Use \fIx\fR (arbitrary\-precision integer or rational) as initial point\&. For example, \fB\-x0 1/3\fR is valid\&. If not given, \fIx\fR is generated from the sigma value for ECM, or at random for P\-1 and P+1\&. .RE .PP \fB\-sigma \fR\fB\fIs\fR\fR .RS 4 [ECM] Use \fIs\fR (arbitrary\-precision integer) as curve generator\&. If omitted, \fIs\fR is generated at random\&. .RE .PP \fB\-A \fR\fB\fIa\fR\fR .RS 4 [ECM] Use \fIa\fR (arbitrary\-precision integer) as curve parameter\&. If omitted, it is generated from the sigma value\&. .RE .PP \fB\-go \fR\fB\fIval\fR\fR .RS 4 [ECM, P\-1, P+1] Multiply the initial point by \fIval\fR, which can be any valid expression, possibly containing the special character N as place holder for the current input number\&. Example: .sp .if n \{\ .RS 4 .\} .nf ecm \-pp1 \-go "N^2\-1" 1e6 < composite2000 .fi .if n \{\ .RE .\} .sp .RE .SH "STEP 2 PARAMETERS" .PP \fB\-k \fR\fB\fIk\fR\fR .RS 4 [ECM, P\-1, P+1] Perform \fIk\fR blocks in step 2\&.
For a given \fIB2\fR value, increasing \fIk\fR decreases the memory usage of step 2, at the expense of more CPU time\&. .RE .PP \fB\-treefile \fR\fB\fIfile\fR\fR .RS 4 Stores some tables of data in disk files to reduce the amount of memory occupied in step 2, at the expense of disk I/O\&. Data will be written to files \fIfile\fR\&.1, \fIfile\fR\&.2 etc\&. Does not work with fast stage 2 for P+1 and P\-1\&. .RE .PP \fB\-power \fR\fB\fIn\fR\fR .RS 4 [ECM, P\-1] Use x^\fIn\fR for Brent\-Suyama\*(Aqs extension (\fB\-power 1\fR disables Brent\-Suyama\*(Aqs extension)\&. The default polynomial is chosen depending on the method and B2\&. For P\-1 and P+1, disables the fast stage 2\&. For P\-1, \fIn\fR must be even\&. .RE .PP \fB\-dickson \fR\fB\fIn\fR\fR .RS 4 [ECM, P\-1] Use degree\-\fIn\fR Dickson\*(Aqs polynomial for Brent\-Suyama\*(Aqs extension\&. For P\-1 and P+1, disables the fast stage 2\&. Like for \fB\-power\fR, \fIn\fR must be even for P\-1\&. .RE .PP \fB\-maxmem \fR\fB\fIn\fR\fR .RS 4 Use at most \fIn\fR megabytes of memory in stage 2\&. .RE .PP \fB\-ntt\fR, \fB\-no\-ntt\fR .RS 4 Enable or disable the Number\-Theoretic Transform code for polynomial arithmetic in stage 2\&. With NTT, dF is chosen to be a power of 2, and is limited by the number of suitable primes that fit in a machine word (which is a limitation only on 32 bit systems)\&. The \-no\-ntt variant uses more memory, but is faster than NTT with large input numbers\&. By default, NTT is used for P\-1, P+1 and for ECM on numbers of size at most 30 machine words\&. .RE .SH "OUTPUT" .PP \fB\-q\fR .RS 4 Quiet mode\&. Found factorizations are printed on standard output, with factors separated by white spaces, one line per input number (if no factor was found, the input number is simply copied)\&. .RE .PP \fB\-v\fR .RS 4 Verbose mode\&. More information is printed, more \fB\-v\fR options increase verbosity\&.
With one \fB\-v\fR, the kind of modular multiplication used, initial x0 value, step 2 parameters and progress, and expected curves and time to find factors of different sizes for ECM are printed\&. With \fB\-v \-v\fR, the A value for ECM and residues at the end of step 1 and step 2 are printed\&. More \fB\-v\fR print internal data for debugging\&. .RE .PP \fB\-timestamp\fR .RS 4 Print a time stamp whenever a new ECM curve or P+1 or P\-1 run is processed\&. .RE .SH "MODULAR ARITHMETIC OPTIONS" .PP Several algorithms are available for modular multiplication\&. The program tries to find the best one for each input; one can force a given method with the following options\&. .PP \fB\-mpzmod\fR .RS 4 Use GMP\*(Aqs mpz_mod function (sub\-quadratic for large inputs, but induces some overhead for small ones)\&. .RE .PP \fB\-modmuln\fR .RS 4 Use Montgomery\*(Aqs multiplication (quadratic version)\&. Usually best method for small input\&. .RE .PP \fB\-redc\fR .RS 4 Use Montgomery\*(Aqs multiplication (sub\-quadratic version)\&. Theoretically optimal for large input\&. .RE .PP \fB\-nobase2\fR .RS 4 Disable special base\-2 code (which is used when the input number is a large factor of 2^n+1 or 2^n\-1, see \fB\-v\fR)\&. .RE .PP \fB\-base2\fR \fIn\fR .RS 4 Force use of special base\-2 code, input number must divide 2^\fIn\fR+1 if \fIn\fR > 0, or 2^|\fIn\fR|\-1 if \fIn\fR < 0\&. .RE .SH "FILE I/O" .PP The following options enable one to perform step 1 and step 2 separately, either on different machines, at different times, or using different software (in particular, George Woltman\*(Aqs Prime95/mprime program can produce step 1 output suitable for resuming with GMP\-ECM)\&. It can also be useful to split step 2 into several runs, using the \fIB2min\-B2max\fR option\&. .PP \fB\-inp \fR\fB\fIfile\fR\fR .RS 4 Take input from file \fIfile\fR instead of from standard input\&. .RE .PP \fB\-save \fR\fB\fIfile\fR\fR .RS 4 Save result of step 1 in \fIfile\fR\&. 
If \fIfile\fR exists, an error is raised\&. Example: to perform only step 1 with \fIB1\fR=1000000 on the composite number in the file "c155" and save its result in file "foo", use .sp .if n \{\ .RS 4 .\} .nf ecm \-save foo 1e6 1 < c155 .fi .if n \{\ .RE .\} .sp .RE .PP \fB\-savea \fR\fB\fIfile\fR\fR .RS 4 Like \fB\-save\fR, but appends to existing files\&. .RE .PP \fB\-resume \fR\fB\fIfile\fR\fR .RS 4 Resume residues from \fIfile\fR, reads from standard input if \fIfile\fR is "\-"\&. Example: to perform step 2 following the above step 1 computation, use .sp .if n \{\ .RS 4 .\} .nf ecm \-resume foo 1e6 .fi .if n \{\ .RE .\} .sp .RE .PP \fB\-chkpnt \fR\fB\fIfile\fR\fR .RS 4 Periodically write the current residue in stage 1 to \fIfile\fR\&. In case of a power failure, etc\&., the computation can be continued with the \fB\-resume\fR option\&. .sp .if n \{\ .RS 4 .\} .nf ecm \-chkpnt foo \-pm1 1e10 < largenumber\&.txt .fi .if n \{\ .RE .\} .sp .RE .SH "LOOP MODE" .PP The \(lqloop mode\(rq (option \fB\-c \fR\fB\fIn\fR\fR) enables one to run several curves on each input number\&. The following options control its behavior\&. .PP \fB\-c \fR\fB\fIn\fR\fR .RS 4 Perform \fIn\fR runs on each input number (default is one)\&. This option is mainly useful for P+1 (for example with \fIn\fR=3) or for ECM, where \fIn\fR could be set to the expected number of curves to find a d\-digit factor with a given step 1 bound\&. This option is incompatible with \fB\-resume, \-sigma, \-x0\fR\&. Giving \fB\-c 0\fR produces an infinite loop until a factor is found\&. .RE .PP \fB\-one\fR .RS 4 In loop mode, stop when a factor is found; the default is to continue until the cofactor is prime or the specified number of runs are done\&. .RE .PP \fB\-b\fR .RS 4 Breadth\-first processing: in loop mode, run one curve for each input number, then a second curve for each one, and so on\&. This is the default mode with \fB\-inp\fR\&.
.RE .PP \fB\-d\fR .RS 4 Depth\-first processing: in loop mode, run \fIn\fR curves for the first number, then \fIn\fR curves for the second one and so on\&. This is the default mode with standard input\&. .RE .PP \fB\-ve \fR\fB\fIn\fR\fR .RS 4 In loop mode, in the second and following runs, output only expressions that have at most \fIn\fR characters\&. Default is \fB\-ve 0\fR\&. .RE .PP \fB\-i \fR\fB\fIn\fR\fR .RS 4 In loop mode, increment \fIB1\fR by \fIn\fR after each curve\&. .RE .PP \fB\-I \fR\fB\fIn\fR\fR .RS 4 In loop mode, multiply \fIB1\fR by a factor depending on \fIn\fR after each curve\&. Default is one which should be optimal on one machine, while \fB\-I 10\fR could be used when trying to factor the same number simultaneously on 10 identical machines\&. .RE .SH "SHELL COMMAND EXECUTION" .PP These options allow for executing shell commands to supplement the functionality of GMP\-ECM\&. .PP \fB\-prpcmd \fR\fB\fIcmd\fR\fR .RS 4 Execute command \fIcmd\fR to test primality of factors and cofactors instead of GMP\-ECM\*(Aqs own functions\&. The number to test is passed via stdin\&. An exit code of 0 is interpreted as \(lqprobably prime\(rq, a non\-zero exit code as \(lqcomposite\(rq\&. .RE .PP \fB\-faccmd \fR\fB\fIcmd\fR\fR .RS 4 Executes command \fIcmd\fR whenever a factor is found by P\-1, P+1 or ECM\&. The input number, factor and cofactor are passed via stdin, each on a line\&. This could be used e\&.g\&. to mail new factors automatically: .sp .if n \{\ .RS 4 .\} .nf ecm \-faccmd \*(Aqmail \-s \(lq$HOSTNAME found a factor\(rq me@myaddress\&.com\*(Aq 11e6 < cunningham\&.in .fi .if n \{\ .RE .\} .sp .RE .PP \fB\-idlecmd \fR\fB\fIcmd\fR\fR .RS 4 Executes command \fIcmd\fR before each ECM curve, P\-1 or P+1 attempt on a number is started\&. If the exit status of \fIcmd\fR is non\-zero, GMP\-ECM terminates immediately, otherwise it continues normally\&.
GMP\-ECM is stopped while \fIcmd\fR runs, offering a way for letting GMP\-ECM sleep for example while the system is otherwise busy\&. .RE .SH "MISCELLANEOUS" .PP \fB\-n\fR .RS 4 Run the program in \(lqnice\(rq mode (below normal priority)\&. .RE .PP \fB\-nn\fR .RS 4 Run the program in \(lqvery nice\(rq mode (idle priority)\&. .RE .PP \fB\-B2scale \fR\fB\fIf\fR\fR .RS 4 Multiply the default step 2 bound \fIB2\fR by the floating\-point value \fIf\fR\&. Example: \fB\-B2scale 0\&.5\fR divides the default \fIB2\fR by 2\&. .RE .PP \fB\-stage1time \fR\fB\fIn\fR\fR .RS 4 Add \fIn\fR seconds to stage 1 time\&. This is useful to get correct expected time with \fI\-v\fR if part of stage 1 was done in another run\&. .RE .PP \fB\-cofdec\fR .RS 4 Force cofactor output in decimal (even if expressions are used)\&. .RE .PP \fB\-h\fR, \fB\-\-help\fR .RS 4 Display a short description of ecm usage, parameters and command line options\&. .RE .PP \fB\-printconfig\fR .RS 4 Prints configuration parameters used for the compilation and exits\&. .RE .SH "INPUT SYNTAX" .PP The input numbers can have several forms: .PP Raw decimal numbers like 123456789\&. .PP Comments can be placed in the file: everything after \(lq//\(rq is ignored, up to the end of line\&. .PP Line continuation\&. If a line ends with a backslash character \(lq\e\(rq, it is considered to continue on the next line\&. .PP Common arithmetic expressions can be used\&. Example: \fI3*5+2^10\fR\&. .PP Factorial: example \fI53!\fR\&. .PP Multi\-factorial: example \fI15!3\fR means 15*12*9*6*3\&. .PP Primorial: example \fI11#\fR means 2*3*5*7*11\&. .PP Reduced primorial: example \fI17#5\fR means 5*7*11*13*17\&. .PP Functions: currently, the only available function is \fIPhi(x,n)\fR\&. .SH "EXIT STATUS" .PP The exit status reflects the result of the last ECM curve or P\-1/P+1 attempt the program performed\&. 
Individual bits signify particular events, specifically: .PP Bit 0 .RS 4 0 if normal program termination, 1 if error occurred .RE .PP Bit 1 .RS 4 0 if no proper factor was found, 1 otherwise .RE .PP Bit 2 .RS 4 0 if factor is composite, 1 if factor is a probable prime .RE .PP Bit 3 .RS 4 0 if cofactor is composite, 1 if cofactor is a probable prime .RE .PP Thus, the following exit status values may occur: .PP 0 .RS 4 Normal program termination, no factor found .RE .PP 1 .RS 4 Error .RE .PP 2 .RS 4 Composite factor found, cofactor is composite .RE .PP 6 .RS 4 Probable prime factor found, cofactor is composite .RE .PP 8 .RS 4 Input number found .RE .PP 10 .RS 4 Composite factor found, cofactor is a probable prime .RE .PP 14 .RS 4 Probable prime factor found, cofactor is a probable prime .RE .SH "BUGS" .PP Report bugs to , after checking for bug fixes or new versions\&. .SH "AUTHORS" .PP Pierrick Gaudry contributed efficient assembly code for combined mul/redc; .PP Jim Fougeron contributed the expression parser and several command\-line options; .PP Laurent Fousse contributed the middle product code, the autoconf/automake tools, and is the maintainer of the Debian package; .PP Alexander Kruppa <(lastname)al@loria\&.fr> contributed estimates for probability of success for ECM, the new P+1 and P\-1 stage 2 (with P\&. L\&. Montgomery), new AMD64 asm mulredc code, and some other things; .PP Dave Newman contributed the Kronecker\-Schoenhage and NTT multiplication code; .PP Jason S\&. Papadopoulos contributed a speedup of the NTT code; .PP Paul Zimmermann is the author of the first version of the program and chief maintainer of GMP\-ECM\&. .PP Note: email addresses have been obscured, the required substitutions should be obvious\&. ecm-6.4.4/AUTHORS0000644023561000001540000000304212106741273010265 00000000000000In the following email addresses, please replace "at" by "@" and "dot" by ".". For example, should read .
Cyril Bouvier contributed the batch mode and GPU for stage 1 Pierrick Gaudry contributed efficient assembly code for combined mul/redc. Brian Gladman contributed the Visual C build files Jim Fougeron contributed the expression parser, the primality testing tools, and several command-line options. Laurent Fousse contributed the middle product code, the autoconf/automake tools, and author of the Debian package. Alexander Kruppa (substitute appropriately) joined Paul Zimmermann at release 5, contributed the Toom-Cook multiplication code, the special code for Fermat numbers, and many other nice things. Dave Newman contributed the Kronecker-Schönhage multiplication code, and the NTT code. Jason S. Papadopoulos contributed optimizations to the NTT code. Paul Zimmermann author of the first version of the program. Several people also helped by suggesting improvements, or testing beta-versions: Allan Steel, Karim Belabas, Torbjörn Granlund, Japke Rosink, Bruce Dodson. If you want to contribute to GMP-ECM, you are welcome; the development version is available on . ecm-6.4.4/NEWS0000644023561000001540000002605312113416752007722 00000000000000Changes between GMP-ECM 6.4.3 and GMP-ECM 6.4.4: * Fixed PowerPC64 assembly code with --enable-shared (thanks Leif Leonhardy) * Fix to deal with change of semantics of internal GMP functions in GMP 5.1 * Fixed small memory leak in non-NTT P-1 stage 2 * Fixed segfaults with large non-NTT P+-1 stage 2 * Removed defunct -t command line option Changes between GMP-ECM 6.4.2 and GMP-ECM 6.4.3: * Fixed bug reported by user "lorgix" on mersenneforum (http://www.mersenneforum.org/showpost.php?p=286385&postcount=280) * Use 64-bit value for random seed under Windows Changes between GMP-ECM 6.4.1 and GMP-ECM 6.4.2: * Corrected the copyright headers * Reduced memory usage in stage 1 with -batch={1,2} mode. * Fixed bug in modular reduction (could occur only for numbers larger than 386 digits on 64-bit computers and 193 digits on 32-bit computers). 
* Speedup in stage 2 with the NTT default mode Changes between ecm-6.4 and ecm-6.4.1: * GMP-ECM is now distributed under the GPL version 3 or later for the binary, and under the LGPL version 3 or later for the library * Fixed a speed regression with respect to ecm-6.3 http://lists.gforge.inria.fr/pipermail/ecm-discuss/2012-February/004103.html * Fixed a bug with the -treefile option which had been present for a long time * Several fixes for the Visual Studio 2010 build * New experimental option -batch=2, and speedup for -batch (i.e., -batch=1) * New tuning mechanism, now --enable-asm-redc is always recommended * New configure option --enable-mulredc-svoboda, for input numbers whose low limbs is congruent to -1 * New tuning parameters for Intel Core i5 * New ecmbench utility Changes between ecm-6.3 and ecm-6.4: * Fixed configure problem with SSE2 (http://trac.sagemath.org/sage_trac/ticket/10252) * Fixed configure bug on 32-bit PowerPC (tried to use 64-bit assembly) https://gforge.inria.fr/tracker/index.php?func=detail&aid=10646 * Fixed dependencies from build directory https://gforge.inria.fr/tracker/index.php?func=detail&aid=10648 * Patch from David Cleaver to allow B1 >= 2^32 on machines where "unsigned long" has 32 bits only * Patch from David Cleaver to use GWNUM 26.6 on Windows x64 with MingW64/Msys * Improved conversion from mpz_t to residue number system in NTT code * Better asm code for AMD cpus * Use of GMP's mpn_mullo_n and mpn_redc_2 when available * New option -batch with faster Stage 1 (but smaller success probability) * Added Visual Studio 2010 build Changes between ecm-6.2.3 and ecm-6.3: * New assembly code for 64-bit PowerPC (thanks to Philip McLaughlin) * Allow several processes to write to the same -save file * More routines in new P+-1 stage 2 use multi-threading in OpenMP build * Fixed incompatibility with GMP 5.0.0 * Fixed several bugs, and now check return value from malloc() calls * Fixed linking of GMP which prevented successful builds under 
Darwin (and presumably other systems) * Allow use of x86_64 asm code under MinGW Changes between ecm-6.2.2 and ecm-6.2.3: * Fixed incompatibility with GMP 4.3.0 when testing version in configure * SSE2 asm code for Visual C added in stage 2 NTT code * Small improvement to x86_64 mulredc asm code, slight speedup on Core 2 * Fixed incorrect carry propagation in subquadratic REDC code which could lead to incorrect arithmetic in rare cases * Fixed memory leak with -v parameter when factor was found in ECM stage 1 * Fixed bug which caused only one ECM curve to be run in spite of -c parameter if input line did not end in newline * Assembler mulredc code enabled by default on x86_64 Changes between ecm-6.2.1 and ecm-6.2.2: * Updated build project files for Visual C by Brian Gladman, also adds missing NTT_GFP_TWIDDLE_DI[FT]_BREAKOVER defines in VC parameter file * Fixed uninitialised parameter to P-1 probability computation * In tune.c : fixed generation of NTT_GFP_TWIDDLE_DI[FT]_BREAKOVER values, avoid calling cputime() excessively often when timing short functions, fixed access to uninitialised memory * Fixed serious split infinitive in configure script (thanks Paul Leyland) * Removed unnecessary carry propagation in x86_64 mulredc code, slight speedup (thanks Philip McLaughlin) * Fixed non-portable PIC code in x86_64/redc.asm * Fixed problem with pattern matching host type names in configure.in * Converted binary constants in spv.c and ntt_gfp.c to hexadecimal, some assembler do not support binary constants Changes between ecm-6.2 and ecm-6.2.1: * Default B2 for new P-1 and P+1 stage 2 increased * Probabilities for finding factors with P-1 are now printed with -v * Fixed compilation problem on IA64, EV56, and ARM * Made threshold between recursive and iterative NTT tunable Changes between ecm-6.1.3 and ecm-6.2: * New stage 2 for P-1 and P+1, described in Montgomery and Kruppa, Improved Stage 2 to P+-1 Factoring Algorithms, in A. J. van der Poorten and A. 
Stein (Eds.), ANTS-VIII 2008, LNCS 5011, pp. 180-195. * Parallelization in the new P+-1 stage 2 (with --enable-openmp). * Optimizations to the NTT code by Jason S. Papadopoulos * Improved mulredc assembly code for Athlon64/Opteron * Improved modular reduction in the mpzmod range * Bugfix in P+1 stage 2 which caused incorrect initialisation if Brent-Suyama polynomial had degree > 1 and i0 was negative (occurs only with non-standard parameters) * Bugfix in generation of Lucas chains for P+1 and ECM, causing some stage 1 primes close to 2^32 to be processed incorrectly on 32 bit systems * Added build project for VC++ by Brian Gladman * File ecm.h changed from GPL to LGPL: the fact it was under GPL was an involuntary mistake, which has the consequence that applications linking with libecm for version < 6.2 should be under GPL too. * Fixed a regression introduced in 6.1.1: the default arithmetic (NTT) for stage 2 was slower for large inputs. Now defaults to -no-ntt for input numbers >30 machine words. Changes between ecm-6.1.2 and ecm-6.1.3: * fixed incorrect computation of memory use in stage 2, especially for machines that use Kronecker-Schoenhage multiplication even for large degrees, such as Core 2. * fixed -B2scale option whose value hadn't been passed to the factoring routines * fixed default B2min for P-1, which could be truncated on 32 bit machines, causing stage 2 to take a little longer than necessary * fixed bug for modular multiplication modulo Fermat numbers 2^2^n+1, where a result of 2^2^n would be truncated to 0. Changes between ecm-6.1.1 and ecm-6.1.2: * changed copyright header from sp.h, to recognize the FSF for parts inspired or taken from gmp-impl.h. 
Changes between ecm-6.0.1 and ecm-6.1: * new assembly code contributed by Pierrick Gaudry for combined mul/redc * new Number Theoretic Transform code contributed by Dave Newman for step 2 * new signal handling and corresponding save files for step 1 * now prints peak memory allocation with -v * improved and simplified tuning * command-line options: added -idlecmd -no-ntt -prpcmd -stage1time -maxmem, removed -prp* * new configure options --enable-asm-redc, --with-gwnum * new exit status codes of ecm program * new interface to George Woltman's GWNUM library (see INSTALL) * chosen stage 2 bound is now printed right away * fixed minor memory leak in mpmod.c/isbase2 * fixed invalid Found input number N reported on some numbers * fixed serious P+1 bug on 64-bit architectures with B1 > 2^32 * fixed wrong detection of divisors of 2^n+1 or 2^n-1 * fixed memory leaks Changes between ecm-6.0 and ecm-6.0.1: * now checks for availability of snprintf() during configure * fixed linking problems with tune and tune2 on PowerPC G5 * fixed segfault in rho.c * fixed main()'s B2 value being overwritten by callees * allow both \r and \n for newline (for Apple computers) * made files compile under Visual C * fixed bug in listz.c that could leave undefined data * fixed the -B2scale option * fixed small error in printed B2' value (with -v) * added Windows section to INSTALL * small corrections to ecm.xml and ecm.1 * added curve counter in loop mode again * fixed segfault when a non-number was in place of B2 on command line * worked around problem with MinGW/Wine scanf() (value 1 too high for %n) * free rhotable memory at end of stage 2 * replaced GSL's dilog_series() in rho.c due to licensing (GPL vs. LGPL) Changes between ecm-5.0.3 and ecm-6.0: * use of the autotools (configure/make) * there is now a documentation in "man" format (ecm.1) * added a set of -prp* command line switches which will use an external program to perform prp testing of candidate and factors. 
When the numbers get large, GMP becomes very non-optimal in PRP testing. An external program, such as OpenPFGW, can be much faster than GMP. (New code from Phil Carmody) * new parser for symbolic input * added hex number input into the expression parser. This was needed due to save files from Prime95 being output in hex. ecm was not resuming these. * added some porting code so ecm builds under VC6. VC6 builds an ecm that is about 1% to 3% faster than the MinGW build (which is about 1% to 3% faster than a Cygwin build) * quiet mode (-q) now prints on stdout all factors found on the same line: f1 f2 ... fk ccc where ccc is the remaining composite (contributed by Laurent Fousse). Example: $ echo 438573459834757 | ./ecm -sigma 6 -q 1e2 2166151 202466707 * special code for Fermat numbers, which improves both steps, example for F12: GMP-ECM 5.0.3 [powered by GMP 4.1.4] [ECM] Input number has 1187 digits Using B1=100000, B2=31565866, polynomial x^2, sigma=4155936925 Step 1 took 27157ms Step 2 took 20830ms GMP-ECM 5.2.0 [powered by GMP 4.1.4] [ECM] Input number has 1187 digits Using B1=100000, B2=31565866, polynomial x^2, sigma=535125396 Step 1 took 24264ms Step 2 took 6605ms * speed improvement in step 2, especially for large B2 (here with the c155 in the GMP-ECM sources), using Kronecker/Schonhage multiplication: GMP-ECM 5.0.3 [powered by GMP 4.1.4] [ECM] Using B1=3000000, B2=4016636514, polynomial Dickson(12), sigma=2383768044 Step 1 took 62305ms Step 2 took 45322ms GMP-ECM 5.2.0 [powered by GMP 4.1.4] [ECM] Using B1=3000000, B2=4016636514, polynomial Dickson(12), sigma=3595368442 Step 1 took 61824ms Step 2 took 32989ms * Brent-Suyama's extension now works for P+1 too * trial division is available (option -t n) * new options -n (low priority) and -nn (idle priority) * the looping mode (-c n) now continues to try to factor the composite when a factor is found (use -one to stop) * new option -ve n to display only inputs of <= n characters (looping mode) * new 
option -treefile which stores product tree of F on disk to save memory * generation of roots use double sieve idea, increases B2 for given k, dF * generation of roots for ECM rewritten to reduce number of extgcds Changes between ecm-5.0 and ecm-5.0.1: * fixed bug when B1 or B2min is too large, and an overflow occurs in step 2, making the computations incorrect. The new limit for B1 or B2min is now around 2^53, and an error occurs when this limit is passed. * fixed problem with save lines incorrectly written when both sigma and A are given with -save. * fixed efficiency problem with -pm1: for B1 > 1e6, use -redc or -mpz_mod by default. ecm-6.4.4/batch.c0000644023561000001540000002720112106741273010445 00000000000000/* batch.c - Implement batch mode for step 1 of ECM Copyright 2011, 2012 Cyril Bouvier, Paul Zimmermann and David Cleaver. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ /* ECM stage 1 in batch mode, for initial point (x:z) with small coordinates, such that x and z fit into a mp_limb_t. For example we can start with (x=2:y=1) with the curve by^2 = x^3 + ax^2 + x with a = 4d-2 and b=16d+2, then we have to multiply by d=(a+2)/4 in the duplicates. 
With the change of variable x=b*X, y=b*Y, this curve becomes: Y^2 = X^3 + a/b*X^2 + 1/b^2*X. */ #include "ecm-impl.h" #define MAX_HEIGHT 32 #if ULONG_MAX == 4294967295 #define MAX_B1_BATCH 2977044736UL #else /* nth_prime(2^(MAX_HEIGHT-1)) */ #define MAX_B1_BATCH 50685770167UL #endif void compute_s (mpz_t s, unsigned long B1) { mpz_t acc[MAX_HEIGHT]; /* To accumulate products of prime powers */ unsigned int i, j; unsigned long pi = 2, pp, maxpp; ASSERT_ALWAYS (B1 < MAX_B1_BATCH); for (j = 0; j < MAX_HEIGHT; j++) mpz_init (acc[j]); /* sets acc[j] to 0 */ i = 0; while (pi <= B1) { pp = pi; maxpp = B1 / pi; while (pp <= maxpp) pp *= pi; if ((i & 1) == 0) mpz_set_ui (acc[0], pp); else mpz_mul_ui (acc[0], acc[0], pp); j = 0; /* We have accumulated i+1 products so far. If bits 0..j of i are all set, then i+1 is a multiple of 2^(j+1). */ while ((i & (1 << j)) != 0) { /* we use acc[MAX_HEIGHT-1] as 0-sentinel below, thus we need j+1 < MAX_HEIGHT-1 */ ASSERT (j + 1 < MAX_HEIGHT - 1); if ((i & (1 << (j + 1))) == 0) /* i+1 is not multiple of 2^(j+2), thus add[j+1] is "empty" */ mpz_swap (acc[j+1], acc[j]); /* avoid a copy with mpz_set */ else mpz_mul (acc[j+1], acc[j+1], acc[j]); /* accumulate in acc[j+1] */ mpz_set_ui (acc[j], 1); j++; } i++; pi = getprime (pi); } for (mpz_set (s, acc[0]), j = 1; mpz_cmp_ui (acc[j], 0) != 0; j++) mpz_mul (s, s, acc[j]); getprime_clear (); /* free the prime tables, and reinitialize */ for (i = 0; i < MAX_HEIGHT; i++) mpz_clear (acc[i]); } /* Return the number of bytes written */ int write_s_in_file (char *fn, mpz_t s) { FILE *file; int ret = 0; #ifdef DEBUG if (fn == NULL) { fprintf (stderr, "write_s_in_file: fn == NULL\n"); exit (EXIT_FAILURE); } #endif file = fopen (fn, "w"); if (file == NULL) { fprintf (stderr, "Could not open file %s for writing\n", fn); return 0; } ret = mpz_out_raw (file, s); fclose (file); return ret; } void read_s_from_file (mpz_t s, char *fn) { FILE *file; int ret = 0; #ifdef DEBUG if (fn == NULL) { fprintf (stderr, 
"read_s_from_file: fn == NULL\n"); exit (EXIT_FAILURE); } #endif file = fopen (fn, "r"); if (file == NULL) { fprintf (stderr, "Could not open file %s for reading\n", fn); exit (EXIT_FAILURE); } ret = mpz_inp_raw (s, file); if (ret == 0) { fprintf (stderr, "read_s_from_file: 0 bytes read from %s\n", fn); exit (EXIT_FAILURE); } fclose (file); } #ifndef GPUECM #if 0 /* this function is useful in debug mode to print non-normalized residues */ static void mpresn_print (mpres_t x, mpmod_t n) { mp_size_t m, xn; xn = SIZ(x); m = ABSIZ(x); MPN_NORMALIZE(PTR(x), m); SIZ(x) = xn >= 0 ? m : -m; gmp_printf ("%Zd\n", x); SIZ(x) = xn; } #endif /* (x1:z1) <- 2(x1:z1) (x2:z2) <- (x1:z1) + (x2:z2) assume (x2:z2) - (x1:z1) = (2:1) Uses 4 modular multiplies and 4 modular squarings. Inputs are x1, z1, x2, z2, d, n. Use two auxiliary variables: t, w (it seems using one only is not possible if all mpresn_mul and mpresn_sqr calls don't overlap input and output). In the batch 1 mode, we pass d_prime such that the actual d is d_prime/beta. Since beta is a square, if d_prime is a square (on 64-bit machines), so is d. In mpresn_mul_1, we multiply by d_prime = beta*d and divide by beta. 
*/
static void
dup_add_batch1 (mpres_t x1, mpres_t z1, mpres_t x2, mpres_t z2,
                mpres_t t, mpres_t w, mp_limb_t d_prime, mpmod_t n)
{
  /* The "active" comments list the variables that hold live values after
     the preceding statement; any other variable may be overwritten. */

  /* active: x1 z1 x2 z2 */
  mpresn_addsub (w, z1, x1, z1, n); /* w = x1+z1, z1 = x1-z1 */
  /* active: w z1 x2 z2 */
  mpresn_addsub (x1, x2, x2, z2, n); /* x1 = x2+z2, x2 = x2-z2 */
  /* active: w z1 x1 x2 */

  mpresn_mul (z2, w, x2, n); /* z2 = (x1+z1)(x2-z2) */
  /* active: w z1 x1 z2 */
  mpresn_mul (x2, z1, x1, n); /* x2 = (x1-z1)(x2+z2) */
  /* active: w z1 x2 z2 */
  mpresn_sqr (t, z1, n);    /* t = (x1-z1)^2 */
  /* active: w t x2 z2 */
  mpresn_sqr (z1, w, n);    /* z1 = (x1+z1)^2 */
  /* active: z1 t x2 z2 */

  mpresn_mul (x1, z1, t, n); /* xdup = (x1+z1)^2 * (x1-z1)^2 */
  /* active: x1 z1 t x2 z2 */

  mpresn_sub (w, z1, t, n);   /* w = (x1+z1)^2 - (x1-z1)^2 */
  /* active: x1 w t x2 z2 */

  mpresn_mul_1 (z1, w, d_prime, n); /* z1 = d * ((x1+z1)^2 - (x1-z1)^2) */
  /* active: x1 z1 w t x2 z2 */

  mpresn_add (t, t, z1, n);  /* t = (x1-z1)^2 + d* ((x1+z1)^2 - (x1-z1)^2) */
  /* active: x1 w t x2 z2 */
  mpresn_mul (z1, w, t, n); /* zdup = w * [(x1-z1)^2 + d* ((x1+z1)^2 - (x1-z1)^2)] */
  /* active: x1 z1 x2 z2 */

  /* Differential addition: at this point x2 and z2 hold the two cross
     products computed above. */
  mpresn_addsub (w, z2, x2, z2, n); /* w = x2+z2, z2 = x2-z2 */
  /* active: x1 z1 w z2 */

  mpresn_sqr (x2, w, n); /* x2 = w^2 */
  /* active: x1 z1 x2 z2 */
  mpresn_sqr (w, z2, n); /* w = z2^2 */
  /* active: x1 z1 x2 w */
  mpresn_add (z2, w, w, n); /* z2 = 2*w */
}

/* Same sequence of operations as dup_add_batch1, except that d is passed as
   a full residue and the multiplication by d uses mpresn_mul instead of
   mpresn_mul_1. */
static void
dup_add_batch2 (mpres_t x1, mpres_t z1, mpres_t x2, mpres_t z2,
                mpres_t t, mpres_t w, mpres_t d, mpmod_t n)
{
  /* active: x1 z1 x2 z2 */
  mpresn_addsub (w, z1, x1, z1, n); /* w = x1+z1, z1 = x1-z1 */
  /* active: w z1 x2 z2 */
  mpresn_addsub (x1, x2, x2, z2, n); /* x1 = x2+z2, x2 = x2-z2 */
  /* active: w z1 x1 x2 */

  mpresn_mul (z2, w, x2, n); /* z2 = (x1+z1)(x2-z2) */
  /* active: w z1 x1 z2 */
  mpresn_mul (x2, z1, x1, n); /* x2 = (x1-z1)(x2+z2) */
  /* active: w z1 x2 z2 */
  mpresn_sqr (t, z1, n);    /* t = (x1-z1)^2 */
  /* active: w t x2 z2 */
  mpresn_sqr (z1, w, n);    /* z1 = (x1+z1)^2 */
  /* active: z1 t x2 z2 */

  mpresn_mul (x1, z1, t, n); /* xdup = (x1+z1)^2 * (x1-z1)^2 */
  /* active: x1 z1 t x2 z2 */

  mpresn_sub (w, z1, t, n);   /* w = (x1+z1)^2 - (x1-z1)^2 */
  /* active: x1 w t x2 z2 */

  mpresn_mul (z1, w, d, n); /* z1 = d * ((x1+z1)^2 - (x1-z1)^2) */
  /* active: x1 z1 w t x2 z2 */

  mpresn_add (t, t, z1, n);  /* t = (x1-z1)^2 + d* ((x1+z1)^2 - (x1-z1)^2) */
  /* active: x1 w t x2 z2 */
  mpresn_mul (z1, w, t, n); /* zdup = w * [(x1-z1)^2 + d* ((x1+z1)^2 - (x1-z1)^2)] */
  /* active: x1 z1 x2 z2 */

  mpresn_addsub (w, z2, x2, z2, n); /* w = x2+z2, z2 = x2-z2 */
  /* active: x1 z1 w z2 */

  mpresn_sqr (x2, w, n); /* x2 = w^2 */
  /* active: x1 z1 x2 z2 */
  mpresn_sqr (w, z2, n); /* w = z2^2 */
  /* active: x1 z1 x2 w */
  mpresn_add (z2, w, w, n); /* z2 = 2*w */
}


/* Input: x is initial point
          A is curve parameter in Montgomery's form:
          g*y^2*z = x^3 + a*x^2*z + x*z^2
          n is the number to factor
          B1 is the stage 1 bound
   Output: If a factor is found, it is returned in f.
           x receives the product of the x-coordinate of the point computed
           in stage 1 with the result of inverting its z-coordinate (i.e.
           the x-coordinate with z normalized to 1 when the inversion
           succeeds).
           B1done is set to B1 if stage 1 completed normally,
           or to the largest prime processed if interrupted, but never
           to a smaller value than B1done was upon function entry.
Return value: ECM_FACTOR_FOUND_STEP1 if a factor, otherwise ECM_NO_FACTOR_FOUND */ /* For now we don't take into account go stop_asap and chkfilename */ int ecm_stage1_batch (mpz_t f, mpres_t x, mpres_t A, mpmod_t n, double B1, double *B1done, int batch, mpz_t s) { mp_limb_t d_1; mpz_t d_2; mpres_t x1, z1, x2, z2; unsigned long i; mpres_t t, u; int ret = ECM_NO_FACTOR_FOUND; MEMORY_TAG; mpres_init (x1, n); MEMORY_TAG; mpres_init (z1, n); MEMORY_TAG; mpres_init (x2, n); MEMORY_TAG; mpres_init (z2, n); MEMORY_TAG; mpres_init (t, n); MEMORY_TAG; mpres_init (u, n); if (batch == 2) { MEMORY_TAG; mpres_init (d_2, n); } MEMORY_UNTAG; /* initialize P */ mpres_set (x1, x, n); mpres_set_ui (z1, 1, n); /* P1 <- 1P */ /* Compute d=(A+2)/4 from A and d'=B*d thus d' = 2^(GMP_NUMB_BITS-2)*(A+2) */ if (batch == 1) { mpres_get_z (u, A, n); mpz_add_ui (u, u, 2); mpz_mul_2exp (u, u, GMP_NUMB_BITS - 2); mpres_set_z_for_gcd (u, u, n); /* reduces u mod n */ if (mpz_size (u) > 1) { mpres_get_z (u, A, n); outputf (OUTPUT_ERROR, "Error, d'=B*(A+2)/4 should fit in a mp_limb_t, A=%Zd\n", u); return ECM_ERROR; } d_1 = mpz_getlimbn (u, 0); } else { /* b = (A0+2)*B/4, where B=2^(k*GMP_NUMB_BITS) for MODMULN or REDC, B=2^GMP_NUMB_BITS for batch1, and B=1 otherwise */ mpres_add_ui (d_2, A, 2, n); mpres_div_2exp (d_2, d_2, 2, n); } /* Compute 2P : no need to duplicate P, the coordinates are simple. 
*/ mpres_set_ui (x2, 9, n); if (batch == 1) /* here d = d_1 / GMP_NUMB_BITS */ { /* warning: mpres_set_ui takes an unsigned long which has only 32 bits on Windows, while d_1 might have 64 bits */ ASSERT_ALWAYS (mpz_size (u) == 1 && mpz_getlimbn (u, 0) == d_1); mpres_set_z (z2, u, n); mpres_div_2exp (z2, z2, GMP_NUMB_BITS, n); } else mpres_set (z2, d_2, n); mpres_mul_2exp (z2, z2, 6, n); mpres_add_ui (z2, z2, 8, n); /* P2 <- 2P = (9 : : 64d+8) */ /* invariant: if j represents the upper bits of s, then P1 = j*P and P2=(j+1)*P */ mpresn_pad (x1, n); mpresn_pad (z1, n); mpresn_pad (x2, n); mpresn_pad (z2, n); /* now perform the double-and-add ladder */ if (batch == 1) { for (i = mpz_sizeinbase (s, 2) - 1; i-- > 0;) { if (mpz_tstbit (s, i) == 0) /* (j,j+1) -> (2j,2j+1) */ /* P2 <- P1+P2 P1 <- 2*P1 */ dup_add_batch1 (x1, z1, x2, z2, t, u, d_1, n); else /* (j,j+1) -> (2j+1,2j+2) */ /* P1 <- P1+P2 P2 <- 2*P2 */ dup_add_batch1 (x2, z2, x1, z1, t, u, d_1, n); } } else /* batch = 2 */ { mpresn_pad (d_2, n); for (i = mpz_sizeinbase (s, 2) - 1; i-- > 0;) { if (mpz_tstbit (s, i) == 0) /* (j,j+1) -> (2j,2j+1) */ /* P2 <- P1+P2 P1 <- 2*P1 */ dup_add_batch2 (x1, z1, x2, z2, t, u, d_2, n); else /* (j,j+1) -> (2j+1,2j+2) */ /* P1 <- P1+P2 P2 <- 2*P2 */ dup_add_batch2 (x2, z2, x1, z1, t, u, d_2, n); } } *B1done=B1; mpresn_unpad (x1); mpresn_unpad (z1); if (!mpres_invert (u, z1, n)) /* Factor found? 
*/ { mpres_gcd (f, z1, n); ret = ECM_FACTOR_FOUND_STEP1; } mpres_mul (x, x1, u, n); mpz_clear (x1); mpz_clear (z1); mpz_clear (x2); mpz_clear (z2); mpz_clear (t); mpz_clear (u); if (batch == 2) { mpz_clear (d_2); } return ret; } #endif ecm-6.4.4/ecm-params.h.core20000644023561000001540000000316612106741273012433 00000000000000/* produced on pasta.loria.fr (Intel(R) Core(TM)2 CPU 6700 @ 2.66GHz) */ #ifndef HAVE_MPIR /* tuning parameters for GMP, tuned for GMP 5.0.4 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,2,0,2,0,2,1,1,1,1,2,2,1,2,2} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 9, 10, 12, 11, 12, 13, 12, 12, 14, 16, 16, 16, 18, 18, 18} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 8 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 128 #else /* tuning parameters for MPIR, tuned for MPIR 2.5.1 */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,1,1,2,2,1,1,1,1,1,1,2,1,2} #define MPZMOD_THRESHOLD 21 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 1, 1, 1, 0, 6, 6, 7, 8, 9, 9, 11, 10, 10, 11, 12, 13, 14, 14, 11, 13, 18, 18, 14, 20, 16, 18, 18, 20} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 256 #define PREREVERTDIVISION_NTT_THRESHOLD 16 #define POLYINVERT_NTT_THRESHOLD 128 #define POLYEVALT_NTT_THRESHOLD 256 #define MPZSPV_NORMALISE_STRIDE 32 #endif 
ecm-6.4.4/pm1fs2.c0000644023561000001540000043420512113200327010466 00000000000000/* Implementation of fast stage 2 for P-1 and P+1 as described in "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic Number Theory Symposium). Copyright 2007, 2008, 2009, 2010, 2011, 2012 Alexander Kruppa, Paul Zimmermann. NTT functions are based on code Copyright 2005 Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #include #include #include #include "ecm-impl.h" #include "sp.h" #include #ifdef HAVE_ALLOCA_H #include #endif #ifdef HAVE_STRING_H #include #endif #ifdef _OPENMP #include #endif /* TODO: - move functions into their proper files (i.e. NTT functions etc.) - later: allow storing NTT vectors on disk */ /* Define TEST_ZERO_RESULT to test if any result of the multipoint evaluation is equal to zero. 
If the modulus is composite, this happening might indicate a problem in the evalutaion code */ #define TEST_ZERO_RESULT const int pari = 0; const unsigned long Pvalues[] = { 3UL, 5UL, 9UL, 15UL, 21UL, 17UL, 27UL, 33UL, 45UL, 51UL, 63UL, 75UL, 105UL, 99UL, 135UL, 165UL, 195UL, 189UL, 231UL, 255UL, 315UL, 345UL, 357UL, 375UL, 405UL, 435UL, 525UL, 585UL, 615UL, 735UL, 765UL, 825UL, 945UL, 1155UL, 1065UL, 1365UL, 1305UL, 1335UL, 1575UL, 1785UL, 1995UL, 2145UL, 2205UL, 2415UL, 2625UL, 2805UL, 3045UL, 3465UL, 3675UL, 4095UL, 4305UL, 4515UL, 4725UL, 4785UL, 5355UL, 5775UL, 5985UL, 5865UL, 6825UL, 7245UL, 8085UL, 8925UL, 9555UL, 10395UL, 10725UL, 11025UL, 12285UL, 12705UL, 15015UL, 14175UL, 15225UL, 16065UL, 17325UL, 19635UL, 21945UL, 23205UL, 24255UL, 25935UL, 26775UL, 28875UL, 31395UL, 33495UL, 35805UL, 36465UL, 38115UL, 39585UL, 40425UL, 45045UL, 45885UL, 49665UL, 51765UL, 58905UL, 65835UL, 69615UL, 75075UL, 77805UL, 82005UL, 84315UL, 86625UL, 88935UL, 94185UL, 98175UL, 105105UL, 109725UL, 116025UL, 118755UL, 121275UL, 135135UL, 137445UL, 137655UL, 144375UL, 153615UL, 165165UL, 167475UL, 176715UL, 179025UL, 185955UL, 197505UL, 208845UL, 215985UL, 225225UL, 255255UL, 250635UL, 285285UL, 277095UL, 294525UL, 315315UL, 345345UL, 373065UL, 368445UL, 405405UL, 435435UL, 451605UL, 465465UL, 454545UL, 504735UL, 525525UL, 555555UL, 569415UL, 596505UL, 645645UL, 647955UL, 672945UL, 687225UL, 765765UL, 770385UL, 805035UL, 855855UL, 858585UL, 915915UL, 945945UL, 962115UL, 1036035UL, 1066065UL, 1119195UL, 1156155UL, 1276275UL, 1306305UL, 1354815UL, 1426425UL, 1456455UL, 1514205UL, 1576575UL, 1666665UL, 1726725UL, 1786785UL, 1789515UL, 1865325UL, 1996995UL, 1983135UL, 2177175UL, 2297295UL, 2327325UL, 2417415UL, 2567565UL, 2611455UL, 2807805UL, 2847075UL, 2878785UL, 3048045UL, 3161235UL, 3258255UL, 3357585UL, 3401475UL, 3533145UL, 3828825UL, 3918915UL, 3985905UL, 4279275UL, 4849845UL, 4789785UL, 4967655UL, 5180175UL, 5360355UL, 5870865UL, 5990985UL, 6561555UL, 6531525UL, 6891885UL, 
7402395UL, 7912905UL, 8273265UL, 8580495UL, 8843835UL, 9444435UL, 10015005UL, 10465455UL, 10705695UL, 10885875UL, 11696685UL, 12267255UL, 12507495UL, 12785955UL, 13498485UL, 14549535UL, 14849835UL, 15570555UL, 16111095UL, 16291275UL, 17612595UL, 18123105UL, 18633615UL, 19684665UL, 20255235UL, 20825805UL, 22207185UL, 22717695UL, 24249225UL, 24819795UL, 25741485UL, 26531505UL, 28333305UL, 29354325UL, 30045015UL, 31396365UL, 32807775UL, 33948915UL, 33528495UL, 34879845UL, 37011975UL, 37522485UL, 39564525UL, 41096055UL, 43648605UL, 44219175UL, 45930885UL, 47222175UL, 48333285UL, 50075025UL, 51816765UL, 52777725UL, 55390335UL, 55547415UL, 59053995UL, 60063465UL, 61906845UL, 64579515UL, 66621555UL, 67492425UL, 70105035UL, 73258185UL, 74939865UL, 77224455UL, 79594515UL, 81876795UL, 84999915UL, 88062975UL, 91005915UL, 94189095UL, 98423325UL, 101846745UL, 111546435UL, 111035925UL, 115120005UL, 121246125UL, 124098975UL, 130945815UL, 140645505UL, 150345195UL, 150225075UL, 155450295UL, 158333175UL, 170255085UL, 179444265UL, 190285095UL, 198843645UL, 203408205UL, 206831625UL, 217222005UL, 229474245UL, 240705465UL, 252447195UL, 254999745UL, 269023755UL, 282146865UL, 287672385UL, 294076965UL, 306110805UL, 318302985UL, 334639305UL, 344338995UL, 354038685UL, 363738375UL, 373438065UL, 387221835UL, 400254855UL, 421936515UL, 431636205UL, 451035585UL, 453888435UL, 470434965UL, 480134655UL, 510765255UL, 522506985UL, 557732175UL, 570855285UL, 596530935UL, 610224615UL, 627912285UL, 654729075UL, 703227525UL, 722116395UL, 751725975UL, 780825045UL, 790524735UL, 821665845UL, 851275425UL, 863017155UL, 909984075UL, 936020085UL, 984518535UL, 1017041025UL, 1052416365UL #if (ULONG_MAX > 4294967295) ,1086110025UL, 1110614505UL, 1147371225UL, 1191785595UL, 1213887675UL, 1265809545UL, 1282356075UL, 1331995665UL, 1391905515UL, 1450103655UL, 1479202725UL, 1547100555UL, 1555088535UL, 1673196525UL, 1712565855UL, 1767130365UL, 1830673845UL, 1883166285UL, 1954487535UL, 2001964965UL, 2119382265UL, 
2187280095UL, 2255177925UL, 2342475135UL, 2390973585UL, 2421213795UL, 2555868315UL, 2672264595UL, 2788660875UL, 2856558705UL, 2953555605UL, 3050552505UL, 3234846615UL, 3457939485UL, 3516137625UL, 3681032355UL, 3758629875UL, 3904125225UL, 4127218095UL, 4360010655UL, 4573403835UL, 4796496705UL, 4844995155UL, 5019589575UL, 5203883685UL, 5262081825UL, 5465775315UL, 5766465705UL, 5898837945UL, 6164152995UL, 6358146795UL, 6411780375UL, 6804332535UL, 6980458485UL, 7172920755UL, 7473611145UL, 7716103395UL, 7968295335UL, 8182259085UL, 8342499165UL, 8812168365UL, 9023519505UL, 9704539845UL, 9927632715UL, 10373818455UL, 10439434005UL, 10820004195UL, 11043097065UL, 11489282805UL, 11877270405UL, 12381654285UL, 12604747155UL, 13080031965UL, 13274025765UL, 13642613985UL, 14389490115UL, 14583483915UL, 15058768725UL, 15611651055UL, 16174233075UL, 16397325945UL, 17289697425UL, 17735883165UL, 18143270145UL, 18381678315UL, 19074440385UL, 19559424885UL, 20636090475UL, 20941375455UL, 21800053275UL, 22643926305UL, 23148310185UL, 24205576395UL, 24546777255UL, 25544133615UL, 26389538175UL, 26863291455UL, 27813861075UL, 29113619535UL, 29494189725UL, 30520074585UL, 30684969315UL, 31790733975UL, 33575476935UL, 34467848415UL, 35202742575UL, 36427185795UL, 38037334335UL, 39240095895UL, 40365259935UL, 42053005995UL, 43168470345UL, 44953213305UL, 45845584785UL, 48522699225UL, 50307442185UL, 51869092275UL, 53653835235UL, 54546206715UL, 56680138515UL, 58784971245UL, 59386352025UL, 61908271425UL, 63431122755UL, 65700850215UL, 67931778915UL, 70162707615UL, 72616729185UL, 74120181135UL, 75740029365UL, 78417143805UL, 80871165375UL, 82840202445UL, 86448487125UL, 88466022645UL, 91133437395UL, 92918180355UL, 100280245065UL, 100726430805UL, 102811864155UL, 106749938295UL, 109000266375UL, 113219631525UL, 119689324755UL, 121027881975UL, 127943760945UL, 132628711215UL, 134859639915UL, 141775518885UL, 148691397855UL, 150922326555UL, 155607276825UL, 161320394235UL, 164977177365UL, 171446870595UL, 
177470378085UL, 183270792705UL
#endif
};

/* All the prime factors that can appear in eulerphi(P) */
const unsigned long phiPfactors[] = {2UL, 3UL, 5UL, 7UL, 11UL, 13UL,
                                     17UL, 19UL};

/* Some useful PARI functions:
   sumset(a,b) = {local(i, j, l); l = listcreate (length(a) * length(b));
                  for (i = 1, length(a),
                    for (j = 1, length(b), listput(l, a[i] + b[j])));
                  listsort (l, 1); l}

   V(i,X) = { if (i==0, return(2)); if (i==1, return(X));
              if(i%2 == 0, return (V (i/2, X)^2-2));
              return (V ((i+1)/2, X) * V ((i-1)/2, X) - X)}

   U(i,X) = { if (i==0, return(0)); if (i==1, return(1));
              if(i%2 == 0, return (U (i/2, X) * V(i/2,X)));
              return (V ((i+1)/2, X) *U( (i-1)/2, X) + 1)}
*/

/* Single-threaded stand-ins used when the file is built without OpenMP:
   one thread in total, and the calling thread has number 0. */
#ifndef _OPENMP
static int omp_get_num_threads () {return 1;}
static int omp_get_thread_num () {return 0;}
#endif

static void
ntt_sqr_reciprocal (mpzv_t, const mpzv_t, mpzspv_t, const spv_size_t,
                    const mpzspm_t);

/* Print " took <t>ms" through outputf at the given verbosity, where <t> is
   the elapsed cpu time since cpu_start. In OpenMP builds, if real_start is
   non-zero, the elapsed real time since real_start is printed as well.
   real_start is unused in builds without OpenMP. */
static void
print_elapsed_time (int verbosity, long cpu_start,
                    ATTRIBUTE_UNUSED long real_start)
{
#ifdef _OPENMP
  if (real_start != 0L)
    {
      outputf (verbosity, " took %lums (%lums real)\n",
               elltime (cpu_start, cputime()),
               elltime (real_start, realtime()));
      return;
    }
#endif
  outputf (verbosity, " took %lums\n", elltime (cpu_start, cputime()));
}

/* If output at the given verbosity is enabled, print prefix followed by the
   small primes ntt_context->spm[i]->sp written as a product, then the number
   of primes and the base-2 logarithm of their product.
   NOTE(review): sp_num is compared with an unsigned int index but printed
   with %d -- confirm its declared type matches the conversion. */
static void
print_CRT_primes (const int verbosity, const char *prefix,
                  const mpzspm_t ntt_context)
{
  double modbits = 0.; /* sum of the natural logs of the primes */
  unsigned int i;

  if (test_verbose (verbosity))
    {
      outputf (verbosity, "%s%lu", prefix, ntt_context->spm[0]->sp);
      modbits += log ((double) ntt_context->spm[0]->sp);
      for (i = 1; i < ntt_context->sp_num; i++)
        {
          outputf (verbosity, " * %lu", ntt_context->spm[i]->sp);
          modbits += log ((double) ntt_context->spm[i]->sp);
        }
      /* dividing by log(2) converts the natural log into a bit count */
      outputf (verbosity, ", has %d primes, %f bits\n",
               ntt_context->sp_num, modbits / log (2.));
    }
}

/* Approximate amount of memory in bytes each coefficient in an NTT takes
   so that NTT can do transforms up to length lmax with modulus, or
   with 2*modulus if twice != 0 */
static size_t
ntt_coeff_mem (const unsigned long lmax, const mpz_t
modulus, const int twice) { mpz_t t; size_t n; mpz_init (t); mpz_mul (t, modulus, modulus); mpz_mul_ui (t, t, lmax); if (twice) mpz_mul_2exp (t, t, 1UL); /* +4: +1 for rounding up, +3 for extra words due to ECRT */ n = (mpz_sizeinbase (t, 2) - 1) / SP_NUMB_BITS + 4; mpz_clear (t); return n * sizeof (sp_t); } size_t pm1fs2_memory_use (const unsigned long lmax, const mpz_t modulus, const int use_ntt) { if (use_ntt) { /* We store lmax / 2 + 1 coefficients for the DCT-I of F and lmax coefficients for G in NTT ready format. Each coefficient in NTT-ready format occupies approx. ceil(log(lmax*modulus^2)/log(bits per sp_t)) + 3 words. */ size_t n; n = ntt_coeff_mem (lmax, modulus, 0) * (size_t) (3 * lmax / 2 + 1); outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use " "with lmax = %lu NTT is %lu bytes\n", lmax, n); return n; } else { /* F stores s_1/2 residues, h stores s_1 mpz_t structs (residues get cloned from F) g stores lmax residues, R stores lmax-s_1 residues, and tmp stores 3*lmax+list_mul_mem (lmax / 2) residues. Assume s_1 is close to lmax/2. Then we have lmax/4 + lmax/2 + lmax + lmax/2 + 3*lmax + list_mul_mem (lmax / 2) = (5+1/4)*lmax + list_mul_mem (lmax / 2) residues, plus s_1 mpz_t. */ size_t n; n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t); n *= 5 * lmax + lmax / 4 + list_mul_mem (lmax / 2); n += lmax / 2 * sizeof (mpz_t); /* Memory use due to temp space allocation in TMulKS appears to approximately triple the estimated memory use. 
This is hard to estimate precisely, so let's go with the fudge factor of 3 here */ n *= 3; outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use " "with lmax = %lu is %lu bytes\n", lmax, n); return n; } } /* return the possible lmax for given memory use and modulus */ unsigned long pm1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt) { if (use_ntt) { size_t n, lmax = 1; n = ntt_coeff_mem (lmax, modulus, 0); lmax = 1UL << ceil_log2 (memory / n / 3); return lmax; } else { size_t lmax, n; n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t); /* Guess an initial value of lmax for list_mul_mem (lmax / 2) */ /* memory = n * 25/4 * lmax + lmax / 2 * sizeof (mpz_t); */ /* Fudge factor of 3 for TMulKS as above */ lmax = memory / (3 * 25 * n / 4 + 3 * sizeof (mpz_t) / 2); return lmax; } } size_t pp1fs2_memory_use (const unsigned long lmax, const mpz_t modulus, const int use_ntt, const int twopass) { size_t n, m; m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t); if (use_ntt) { /* In one pass mode, we store h_x_ntt and h_y_ntt, each of length lmax/2(+1), and g_x_ntt and g_y_ntt, each of length lmax, all in NTT ready format. In two pass mode, we store h_x_ntt, h_y_ntt and g_x_ntt as before, plus R which is lmax - s_1 mpz_t. We assume s_1 ~= lmax/2. */ n = ntt_coeff_mem (lmax, modulus, !twopass); if (twopass) return lmax * (2 * n + m / 2); else return lmax * 3 * n; } else { /* We allocate: F: s_1/2 coefficients fh_x, fh_y: s_1/2 coefficients h_x, h_y: s_1 mpz_t's (cloned from fh_x and fh_y) g_x, g_y: lmax coefficients R_x, R_y: lmax - s_1 coefficients tmp: 3UL * lmax + list_mul_mem (lmax / 2) Assuming s_1 ~ lmax/2, that's lmax/2 + 2*lmax/4 + 2*lmax + 2*lmax/2 * 3*lmax + list_mul_mem (lmax / 2) = 7 + list_mul_mem (lmax / 2) coefficients and lmax mpz_t. 
*/

      n = m * (7 * lmax + list_mul_mem (lmax / 2));
      n += lmax * sizeof (mpz_t);
      n = 5 * n / 2; /* A fudge factor again */
      return n;
    }
}

/* Return a convolution length lmax that the P+1 fast stage 2 can use within
   the given amount of memory (in bytes) for the given modulus. use_ntt and
   twopass have the same meaning as in pp1fs2_memory_use. */
unsigned long
pp1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt,
               const int twopass)
{
  size_t n, m;

  /* m: bytes for one residue, i.e. its limbs plus the mpz_t struct */
  m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
  if (use_ntt)
    {
      /* per-coefficient size, evaluated for a transform length of 1 */
      n = ntt_coeff_mem (1, modulus, !twopass);
      /* from here on n is the number of coefficients that fit in memory,
         using the same per-length cost as pp1fs2_memory_use */
      if (twopass)
        n = memory / (2 * n + m / 2);
      else
        n = memory / (3 * n);
      return 1UL << (ceil_log2 (n / 2)); /* Rounded down to power of 2 */
    }
  else
    {
      /* NOTE(review): this appears to invert the non-NTT estimate of
         pp1fs2_memory_use (fudge factor 5/2), budgeting 8*m bytes per unit
         of lmax -- presumably 7 for the explicit lists plus 1 for
         list_mul_mem; confirm. */
      return memory / 5 / (m * 8 + sizeof (mpz_t)) * 2;
    }
}


/* Test if for given P, nr, B2min and B2 we can choose an m_1 so that the
   stage 2 interval [B2min, B2] is covered. The effective B2min and B2
   are stored in effB2min and effB2.
   The chosen m_1 is stored in m_1. Returns non-zero iff B2 <= effB2. */

static int
test_P (const mpz_t B2min, const mpz_t B2, mpz_t m_1, const unsigned long P,
        const unsigned long nr, mpz_t effB2min, mpz_t effB2)
{
  mpz_t m;
  /* We need B2min >= 2 * max(S_1 + S_2) + (2*m_1 - 1)*P + 1, or
     B2min - 2 * max(S_1 + S_2) - 1 >= (2*m_1)*P - P, or
     (B2min - 2*max(S_1 + S_2) + P - 1)/(2P) >= m_1
     Choose m_1 accordingly */

  mpz_init (m);
  sets_max (m, P);
  mpz_mul_2exp (m, m, 1UL); /* m = 2*max(S_1 + S_2) */

  mpz_sub (m_1, B2min, m);
  mpz_sub_ui (m_1, m_1, 1UL); /* m_1 = B2min - 2*max(S_1 + S_2) - 1 */
  mpz_add_ui (m_1, m_1, P);
  /* divide by 2 and by P in two steps, both rounding towards -infinity */
  mpz_fdiv_q_2exp (m_1, m_1, 1UL);
  mpz_fdiv_q_ui (m_1, m_1, P);    /* 2UL*P may overflow */

  /* Compute effB2min = 2 * max(S_1 + S_2) + (2*(m_1 - 1) + 1)*P + 1 */

  mpz_mul_2exp (effB2min, m_1, 1UL);
  mpz_sub_ui (effB2min, effB2min, 1UL);
  mpz_mul_ui (effB2min, effB2min, P);
  mpz_add (effB2min, effB2min, m);
  mpz_add_ui (effB2min, effB2min, 1UL);
  ASSERT_ALWAYS (mpz_cmp (effB2min, B2min) <= 0);

  /* Compute the smallest value coprime to P at the high end of the stage 2
     interval that will not be covered:
     2*(min(S_1 + S_2)) + (2*(m_1 + nr) + 1)*P.
We assume min(S_1 + S_2) = -max(S_1 + S_2) */ mpz_add_ui (effB2, m_1, nr); mpz_mul_2exp (effB2, effB2, 1UL); mpz_add_ui (effB2, effB2, 1UL); mpz_mul_ui (effB2, effB2, P); mpz_sub (effB2, effB2, m); /* The effective B2 values is that value, minus 1 */ mpz_sub_ui (effB2, effB2, 1UL); mpz_clear (m); return (mpz_cmp (B2, effB2) <= 0); } static void factor_phiP (int *exponents, const unsigned long phiP) { const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long); unsigned long cofactor = phiP; int i; ASSERT_ALWAYS (phiP > 0UL); for (i = 0; i < nrprimes; i++) for (exponents[i] = 0; cofactor % phiPfactors[i] == 0UL; exponents[i]++) cofactor /= phiPfactors[i]; ASSERT_ALWAYS (cofactor == 1UL); } static unsigned long pow_ul (const unsigned long b, const unsigned int e) { unsigned long r = 1UL; unsigned int i; for (i = 0; i < e; i++) r *= b; return r; } static unsigned long absdiff_ul (unsigned long a, unsigned long b) { return (a > b) ? a - b : b - a; } /* Choose s_1 so that s_1 * s_2 = phiP, s_1 is positive and even, s_2 >= min_s2 and s_2 is minimal and abs(s_1 - l) is minimal under those conditions. If use_ntt == 1, we require s_1 < l. Returns 0 if no such choice is possible */ static unsigned long choose_s_1 (const unsigned long phiP, const unsigned long min_s2, const unsigned long l, const int use_ntt) { const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long); /* Using [nrprimes] here makes the compiler complain about variable-sized arrays */ int phiPexponents[sizeof (phiPfactors) / sizeof (unsigned long)], exponents[sizeof (phiPfactors) / sizeof (unsigned long)]; unsigned long s_1 = 0UL, s_2 = 0UL, trys_1; int i; ASSERT_ALWAYS (phiP > 0 && phiP % 2 == 0); /* We want only even s_1. We divide one 2 out of phiP here... */ factor_phiP (phiPexponents, phiP / 2); for (i = 0; i < nrprimes; i++) exponents[i] = 0; do { trys_1 = 2; /* ... 
and add a 2 here */ for (i = 0; i < nrprimes; i++) trys_1 *= pow_ul (phiPfactors[i], exponents[i]); #if 0 printf ("choose_s_1: Trying trys_1 = %lu\n", trys_1); #endif /* See if it satisfies all the required conditions and is an improvement over the previous choice */ if (phiP / trys_1 >= min_s2 && (s_2 == 0UL || phiP / trys_1 < s_2) && absdiff_ul (trys_1, l) < absdiff_ul (s_1, l) && (use_ntt == 0 || trys_1 < l)) { #if 0 printf ("choose_s_1: New best s_1 for phiP = %lu, min_s2 = %lu, " "l = %lu : %lu\n", phiP, min_s2, l, trys_1); #endif s_1 = trys_1; } for (i = 0; i < nrprimes; i++) { if (++(exponents[i]) <= phiPexponents[i]) break; exponents[i] = 0; } } while (i < nrprimes); return s_1; } /* Approximate cost of stage 2. Cost with and without ntt are not comparable. We have l > s_1 and s_1 * s_2 = eulerphi(P), hence s_2*l > eulerphi(P) and so cost (s_2, l) > eulerphi(P) for all P */ static unsigned long est_cost (const unsigned long s_2, const unsigned long l, const int use_ntt, const int method) { if (method == ECM_PM1) { /* The time for building f, h and DCT-I of h seems to be about 7/6 of the time of computing g, h*g and gcd with NTT, and 3/2 of the time of computing g, h*g and gcd without NTT */ if (use_ntt) return (7 * l) / 6 + s_2 * l; else return (3 * l) / 2 + s_2 * l; } else if (method == ECM_PP1) { /* Building f is the same, building h and its forward transform is twice about as expensive as for P-1. Each multi-point evaluation is twice as expensive as for P-1. FIXME: The estimate for NTT assumes the "one-pass" variant, in "two-pass" the multipoint evaluations are slower, so the optimum shifts towards smaller s_2 some more */ if (use_ntt) return (4 * l) / 5 + s_2 * l; else return (3 * l) / 4 + s_2 * l; } else abort (); /* Invalid value for method */ } /* Choose P so that a stage 2 range from B2min to B2 can be covered with multipoint evaluations, each using a convolution of length at most lmax. 
The parameters for stage 2 are stored in finalparams, the final effective B2min and B2 values in final_B2min and final_B2, respecively. Each of these may be NULL, in which case the value is not stored. It is permissible to let B2min and final_B2min, or B2 and final_B2 point at the same mpz_t. */ long choose_P (const mpz_t B2min, const mpz_t B2, const unsigned long lmax, const unsigned long min_s2, faststage2_param_t *finalparams, mpz_t final_B2min, mpz_t final_B2, const int use_ntt, const int method) { /* Let S_1 + S_2 == (Z/PZ)* (mod P). Let F(x) = \prod_{k_1 \in S_1} (x - b_1^{2 k_1}). If we evaluate F(b_1^{2 k_2 + (2m + 1)P}) for all k_2 \in S_2 with m_1 <= m < m_1+nr, we test all exponents 2 k_2 + (2m + 1)P - 2 k_1. The largest value coprime to P at the low end of the stage 2 interval *not* covered will be 2*max(S_2) + (2*(m_1-1) + 1)*P - 2*min(S_1). The smallest value at the high end not covered will be 2*min(S_2) + (2*(m_1 + nr) + 1)*P - 2*max(S_1). Assume S_1 and S_2 are symmetric around 0, so that max(S_1) = -min(S_1). Then the largest ... is: 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P The smallest ... is: -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P The effective B2min = 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P + 1 The effective B2max = -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P - 1 Then the difference effB2max - effB2min = -4*(max(S_1) + max(S_2)) + 2P*(nr + 1) - 2 We obviously require B2max - B2min <= 2*nr*P Since nr < lmax, B2max - B2min <= 2*lmax*P or P >= ceil((B2max - B2min)/(2*lmax)) Hence we are looking for an odd P with s_1 * s_2 = eulerphi(P) so that s_1 ~= lmax / 2 and the whole stage 2 interval is covered. s_2 should be small, as long as s_1 is small enough. 
*/ mpz_t B2l, m_1, effB2min, tryeffB2, effB2, lmin; /* The best parameters found so far, P == 0 means that no suitable P has been found yet: */ unsigned long P = 0, s_1 = 0, s_2 = 0, l = 0, cost = 0; unsigned int i; const unsigned int Pvalues_len = sizeof (Pvalues) / sizeof (unsigned long); int r; outputf (OUTPUT_TRACE, "choose_P(B2min = %Zd, B2 = %Zd, lmax = %lu, min_s2 = %ld, " "use_ntt = %d, method = %d\n", B2min, B2, lmax, min_s2, use_ntt, method); if (mpz_cmp (B2, B2min) < 0) return 0L; /* If we use the NTT, we allow only power-of-two transform lengths. In that case, the code below assumes that lmax is a power of two. If that is not the case, print error and return. */ if (use_ntt && (lmax & (lmax - 1UL)) != 0) { outputf (OUTPUT_ERROR, "choose_P: Error, lmax = %lu is not a power of two\n", lmax); return ECM_ERROR; } mpz_init (effB2); mpz_init (tryeffB2); mpz_init (effB2min); mpz_init (B2l); mpz_init (m_1); mpz_init (lmin); mpz_sub (B2l, B2, B2min); mpz_add_ui (B2l, B2l, 1UL); /* +1 due to closed interval */ /* For each candidate P, check if [B2min, B2] can be covered at all, and if so, what the best parameters (minimizing the cost, maximizing effB2) are. If they are better than the best parameters for the best P so far, remember them. */ for (i = 0 ; i < Pvalues_len; i++) { unsigned long tryP, tryphiP, trys_1, trys_2, tryl, trycost; tryP = Pvalues[i]; tryphiP = eulerphi (tryP); outputf (OUTPUT_TRACE, "choose_P: trying P = %lu, eulerphi(P) = %lu\n", tryP, tryphiP); /* If we have a good P already and this tryphiP >= cost, then there's no hope for this tryP, since cost(s_2, l) > eulerphi(P) */ if (P != 0 && tryphiP >= cost) { outputf (OUTPUT_TRACE, "choose_P: tryphiP > cost = %lu, this P is too large\n", cost); continue; } /* We have nr < l and effB2-effB2min <= 2*nr*P. 
Hence we need l >= B2l/P/2 */ mpz_cdiv_q_ui (lmin, B2l, tryP); mpz_cdiv_q_2exp (lmin, lmin, 1UL); outputf (OUTPUT_TRACE, "choose_P: lmin = %Zd for P = %lu\n", lmin, tryP); if (mpz_cmp_ui (lmin, lmax) > 0) { outputf (OUTPUT_TRACE, "choose_P: lmin > lmax, this P is too small\n"); continue; } /* Try all possible transform lengths and store parameters in P, s_1, s_2, l if they are better than the previously best ones */ /* Keep reducing tryl to find best parameters. For NTT, we only have power of 2 lengths so far, so we can simply divide by 2. For non-NTT, we have arbitrary transform lengths so we can decrease in smaller steps... let's say by, umm, 25% each time? */ for (tryl = lmax; mpz_cmp_ui (lmin, tryl) <= 0; tryl = (use_ntt) ? tryl / 2 : 3 * tryl / 4) { trys_1 = choose_s_1 (tryphiP, min_s2, tryl / 2, use_ntt); if (trys_1 == 0) { outputf (OUTPUT_TRACE, "choose_P: could not choose s_1 for P = %lu, l = %lu\n", tryP, tryl); continue; } ASSERT (tryphiP % trys_1 == 0UL); trys_2 = tryphiP / trys_1; outputf (OUTPUT_TRACE, "choose_P: chose s_1 = %lu, k = s_2 = %lu " "for P = %lu, l = %lu\n", trys_1, trys_2, tryP, tryl); if (test_P (B2min, B2, m_1, tryP, tryl - trys_1, effB2min, tryeffB2)) { outputf (OUTPUT_TRACE, "choose_P: P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu " "works, m_1 = %Zd, effB2min = %Zd, effB2 = %zZd\n", tryP, tryl, trys_1, trys_2, m_1, effB2min, tryeffB2); /* We use these parameters if we 1. didn't have any suitable ones yet, or 2. these cover [B2min, B2] and are cheaper than the best ones so far, or 3. they are as expensive but reach greater effB2. 
*/ trycost = est_cost (trys_2, tryl, use_ntt, method); ASSERT (tryphiP < trycost); if (P == 0 || trycost < cost || (trycost == cost && mpz_cmp (tryeffB2, effB2) > 0)) { outputf (OUTPUT_TRACE, "choose_P: and is the new optimum (cost = %lu)\n", trycost); P = tryP; s_1 = trys_1; s_2 = trys_2; l = tryl; cost = trycost; mpz_set (effB2, tryeffB2); } } } } if (P != 0) /* If we found a suitable P */ { /* Compute m_1, effB2min, effB2 again */ r = test_P (B2min, B2, m_1, P, l - s_1, effB2min, effB2); ASSERT_ALWAYS(r != 0); if (finalparams != NULL) { finalparams->P = P; finalparams->s_1 = s_1; finalparams->s_2 = s_2; finalparams->l = l; mpz_set (finalparams->m_1, m_1); } if (final_B2min != NULL) mpz_set (final_B2min, effB2min); if (final_B2 != NULL) mpz_set (final_B2, effB2); } mpz_clear (effB2); mpz_clear (tryeffB2); mpz_clear (effB2min); mpz_clear (B2l); mpz_clear (m_1); mpz_clear (lmin); return (P != 0) ? (long) P : ECM_ERROR; } static void list_output_poly (listz_t l, unsigned long len, int monic, int symmetric, char *prefix, char *suffix, int verbosity) { unsigned long i; if (prefix != NULL) outputf (verbosity, prefix); if (len == 0) { if (monic) outputf (verbosity, "1\n", len, len); else outputf (verbosity, "0\n", len); return; } if (monic) { if (symmetric) outputf (verbosity, "(x^%lu + x^-%lu) + ", len, len); else outputf (verbosity, "x^%lu + ", len); } for (i = len - 1; i > 0; i--) if (symmetric) outputf (verbosity, "%Zd * (x^%lu + x^-%lu) + ", l[i], i, i); else outputf (verbosity, "%Zd * x^%lu + ", l[i], i); outputf (verbosity, "%Zd", l[0]); if (suffix != NULL) outputf (verbosity, suffix); } /* Multiply P[i] by r^{k(deg-i)}, for 0 <= i <= deg. Needs 3 entries in tmp. */ /* I.e., let P(x) = x^deg + \sum_{i=0}^{deg - 1} P[i] * x^i. The output is R(x) = x^deg + \sum_{i=0}^{deg - 1} R[i] * x^i = r^(k deg) P(r^{-k} x). */ /* The input and output polynomials are monic and have the leading monomial implicit, i.e. not actually stored in the array of coefficients. 
*/ /* Returns 0 if a modular inversion failed (in which case R is left unchanged), 1 otherwise */ static int ATTRIBUTE_UNUSED list_scale_rev (listz_t R, listz_t S, mpz_t r, long k, unsigned long deg, mpz_t modulus, listz_t tmp, ATTRIBUTE_UNUSED const unsigned long tmplen) { unsigned long i; ASSERT (tmplen >= 3); mpz_powm_ui (tmp[0], r, (unsigned long) labs (k), modulus); if (k < 0) { if (!mpz_invert (tmp[0], tmp[0], modulus)) /* FIXME: get rid of this! */ return 0; } /* Here, tmp[0] = r^k */ mpz_set (tmp[1], tmp[0]); /* mpz_set (R[deg], S[deg]); Leading monomial is not stored! */ for (i = 1; i + 1 <= deg; i++) { /* Here, tmp[1] = r^(ki) */ mpz_mul (tmp[2], S[deg-i], tmp[1]); mpz_mod (R[deg-i], tmp[2], modulus); mpz_mul (tmp[2], tmp[1], tmp[0]); /* FIXME, avoid unnecessary mul */ mpz_mod (tmp[1], tmp[2], modulus); /* at end of loop */ } if (i <= deg) { mpz_mul (tmp[2], S[deg-i], tmp[1]); mpz_mod (R[deg-i], tmp[2], modulus); } return 1; } /* Same, but does squaring which makes things easier */ static void list_sqr_reciprocal (listz_t R, listz_t S, const unsigned long l, mpz_t modulus, listz_t tmp, ATTRIBUTE_UNUSED const unsigned long tmplen) { unsigned long i; listz_t Srev, r1 = tmp, r2 = tmp + 2 * l - 1, t = tmp + 4 * l - 2; if (l == 0UL) return; /* FIXME: This modifies the input arguments. 
*/ /* We have to divide S[0] by 2 */ ASSERT (tmplen >= 4 * l - 2 + list_mul_mem (l)); #if 0 gmp_printf ("/* list_sqr_reciprocal */ S(x) = %Zd", S[0]); for (i = 1; i < l1; i++) gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S[i], i, i); gmp_printf ("\n"); #endif if (mpz_odd_p (S[0])) { ASSERT_ALWAYS (mpz_odd_p (modulus)); mpz_add (S[0], S[0], modulus); } mpz_tdiv_q_2exp (S[0], S[0], 1UL); list_mul (r1, S, l, 0, S, l, 0, t); /* r1 = f0*g0/4 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */ #if 0 for (i = 0; i < 2UL * l - 1UL; i++) gmp_printf ("list_sqr_reciprocal: r1[%lu] = %Zd\n", i, r1[i]); #endif Srev = (listz_t) malloc (l * sizeof (mpz_t)); ASSERT_ALWAYS (Srev != NULL); for (i = 0UL; i < l; i++) (*Srev)[i] = (*S)[l - 1UL - i]; list_mul (r2, S, l, 0, Srev, l, 0, t); /* r2 is symmetric, r2[i] = r2[2*l - 2 - i]. Check this */ #if 0 for (i = 0; 0 && i < 2UL * l - 1UL; i++) gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]); #endif #ifdef WANT_ASSERT for (i = 0UL; i < l; i++) ASSERT (mpz_cmp (r2[i], r2[2UL * l - 2UL - i]) == 0); #endif free (Srev); /* r2 = g1*f0/2 + (g0*f0/4 + g1*f1) * x + g0*f1/2 * x^2 */ #if 0 for (i = 0; i < 2UL * l - 1UL; i++) gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]); #endif mpz_mul_2exp (r1[0], r1[0], 1UL); /* r1 = f0*g0/2 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */ for (i = 0UL; i < l; i++) { mpz_mul_2exp (r2[l - i - 1UL], r2[l - i - 1UL], 1UL); mpz_add (R[i], r1[i], r2[l - i - 1UL]); } /* r1 = 3/4*f0*g0 + g1*f1 + (f0*g1 + 2*f1*g0)/2 * x + f1*g1 * x^2 */ /* r1 = f0*g0 + 2*g1*f1 + (f0*g1 + f1*g0) * x + f1*g1 * x^2 */ for (i = l; i < 2UL * l - 1UL; i++) mpz_set (R[i], r1[i]); if (R != S) mpz_mul_2exp (S[0], S[0], 1UL); #if 0 for (i = 0; i < 2UL * l; i++) gmp_printf ("list_sqr_reciprocal: R[%lu] = %Zd\n", i, R[i]); #endif } ATTRIBUTE_UNUSED static void list_recip_eval1 (mpz_t R, const listz_t S, const unsigned long l) { unsigned long i; mpz_set_ui (R, 0UL); for (i = 1; i < l; i++) mpz_add (R, R, S[i]); mpz_mul_2exp (R, R, 1UL); if (l 
> 0UL) mpz_add (R, R, S[0]); } /* Multiply two reciprocal polynomials of degree 2*l1-2 and 2*l2-2, resp., with coefficients in standard basis S_1(x) = S1[0] + sum_{1 \leq i \leq l1 - 1} S1[i] (x^i + x^{-i}) S_2(x) = S2[0] + sum_{1 \leq i \leq l2 - 1} S2[i] (x^i + x^{-i}) to the reciprocal polynomial of degree 2*(l1 + l2) - 4 R(x) = R[0] + sum_{1 \leq i \leq l1 + l2 - 2} R[i] (x^i + x^{-i}) = S_1(x) * S_2(x) R == S1 == S2 is permissible, however if S1 == S2, l1 must be equal to l2 (i.e. the multiplication must be a squaring) */ /* FIXME: This modifies the input arguments. */ /* We have to divide S1[0] and S2[0] by 2 */ static void list_mul_reciprocal (listz_t R, listz_t S1, unsigned long l1, listz_t S2, unsigned long l2, mpz_t modulus, listz_t tmp, ATTRIBUTE_UNUSED const unsigned long tmplen) { unsigned long i; const unsigned long lmax = MAX(l1, l2); listz_t r1 = tmp, r2 = tmp + 2*lmax - 1, rev = tmp + 4*lmax - 2, t = tmp + 6*lmax - 3; #ifdef WANT_ASSERT mpz_t sum1, sum2, prod; #endif ASSERT (S1 < tmp || S1 >= tmp + tmplen); ASSERT (S2 < tmp || S2 >= tmp + tmplen); ASSERT (R < tmp || R >= tmp + tmplen); if (l1 == 0UL || l2 == 0UL) return; if (S1 == S2) { ASSERT_ALWAYS (l1 == l2); list_sqr_reciprocal (R, S1, l1, modulus, tmp, tmplen); return; } ASSERT (tmplen >= 6*lmax - 3 + list_mul_mem (lmax)); #ifdef WANT_ASSERT mpz_init (sum1); mpz_init (sum2); mpz_init (prod); list_recip_eval1 (sum1, S1, l1); list_recip_eval1 (sum2, S2, l2); mpz_mul (prod, sum1, sum2); mpz_mod (prod, prod, modulus); #endif /* Make S1 the longer of the two, i.e. 
l1 >= l2 */ if (l2 > l1) { listz_t St = S1; unsigned long lt = l1; S1 = S2; S2 = St; l1 = l2; l2 = lt; } #if 0 gmp_printf ("/* list_mul_reciprocal */ S1(x) = %Zd", S1[0]); for (i = 1; i < l1; i++) gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S1[i], i, i); gmp_printf ("\n"); gmp_printf ("/* list_mul_reciprocal */ S2(x) = %Zd", S2[0]); for (i = 1; i < l1; i++) gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S2[i], i, i); gmp_printf ("\n"); #endif /* Divide S1[0] and S2[0] by 2 */ if (mpz_odd_p (S1[0])) { ASSERT_ALWAYS (mpz_odd_p (modulus)); mpz_add (S1[0], S1[0], modulus); } mpz_tdiv_q_2exp (S1[0], S1[0], 1UL); if (mpz_odd_p (S2[0])) { ASSERT_ALWAYS (mpz_odd_p (modulus)); mpz_add (S2[0], S2[0], modulus); } mpz_tdiv_q_2exp (S2[0], S2[0], 1UL); /* Pad rev with zeros */ for (i = l2; i < lmax; i++) mpz_set_ui (rev[i], 0UL); for (i = 0UL; i < l2; i++) mpz_set (rev[i], S2[l2 - 1UL - i]); list_mul (r1, S1, lmax, 0, rev, lmax, 0, t); /* r1 = \tilde{f}(x) \rev(\tilde{g}(x)) and has degree l1 + l2 - 2, i.e. l1 + l2 - 1 entries. */ #if 0 for (i = 0; i < 2 * lmax - 1; i++) gmp_printf ("list_mul_reciprocal: r1[%lu] = %Zd\n", i, r1[i]); #endif for (i = 0UL; i < l2; i++) mpz_set(rev[i], S2[i]); list_mul (r2, S1, lmax, 0, rev, lmax, 0, t); /* \tilde{f}(x) \tilde{g}(x) */ #if 0 for (i = 0; i < 2 * lmax - 1; i++) gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]); #endif /* Add f_0*g_0 by doubling the f_0*g_0 term in r2 */ mpz_mul_2exp (r2[0], r2[0], 1UL); /* Add \flloor x^{-d_g} \tilde{f}(x) \rev(\tilde{g}(x)) \rfloor. d_g = l2 - 1. */ for (i = 0; i < l1; i++) mpz_add (r2[i], r2[i], r1[i + l2 - 1]); /* Add \floor x^{-d_f} rev(\tilde{f}(x) \rev(\tilde{g}(x))) \rfloor. d_f = l1 - 1. rev(r2)[i] = r2[l1 + l2 - 2 - i]. We want rev(r2)[l1 - 1 ... l1 + l2 - 2], hence r2[l2 - 1 ... 
0] */ for (i = 0; i < l2; i++) mpz_add (r2[i], r2[i], r1[l2 - 1 - i]); #if 0 for (i = 0; i < l1 + l2 - 1; i++) gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]); #endif mpz_mul_2exp (S1[0], S1[0], 1UL); mpz_mul_2exp (S2[0], S2[0], 1UL); for (i = 0; i < l1 + l2 - 1; i++) mpz_set (R[i], r2[i]); #if 0 for (i = 0; i < l1 + l2 - 1; i++) gmp_printf ("list_mul_reciprocal: R[%lu] = %Zd\n", i, R[i]); #endif #ifdef WANT_ASSERT list_recip_eval1 (sum1, R, l1 + l2 - 1); mpz_mod (sum1, sum1, modulus); ASSERT (mpz_cmp (prod, sum1) == 0); mpz_clear (sum1); mpz_clear (sum2); mpz_clear (prod); #endif } /* Multiply a (possibly monic) polynomial A of length k * len with a (possibly monic) polynomial B of length len. R may be identical to A. */ static void ATTRIBUTE_UNUSED list_mul_blocks (listz_t R, const listz_t A, int monicA, const listz_t B, int monicB, const unsigned long len, const unsigned int k, listz_t tmp, ATTRIBUTE_UNUSED const unsigned long tmplen) { unsigned int j; if (k == 0 || len == 0) return; ASSERT (R != B); ASSERT (tmplen >= 3 * len + list_mul_mem (len)); /* Do first piece of A */ list_mul (tmp, A, len, (monicA && k == 1), B, len, monicB, tmp + 2 * len); list_set (R, tmp, len); /* May overwrite A[0 ... len-1] */ list_swap (tmp, tmp + len, len); /* Move high part to tmp[0 ... 
len-1] */ for (j = 1; j < k; j++) /* Process the remaining k-1 pieces of A */ { list_mul (tmp + len, A + j * len, len, (monicA && j + 1 == k), B, len, monicB, tmp + 3 * len); /* Add low part of this product and previous product's high part */ list_add (A + j * len, tmp, tmp + len, len); list_swap (tmp, tmp + 2 * len, len); /* Move this product's high part to beginning of tmp */ } list_set (A + j * len, tmp, len); /* Move the high part of last product */ } /* Computes V_k(S), where the Chebyshev polynomial V_k(X) is defined by V_k(X + 1/X) = X^k + 1/X^k */ static void V (mpres_t R, const mpres_t S, const long k, mpmod_t modulus) { mpres_t V0, Vi, Vi1; unsigned long j, uk; int po2; if (k == 0L) { mpres_set_ui (R, 2UL, modulus); return; } uk = labs (k); if (uk == 1UL) { mpres_set (R, S, modulus); return; } for (po2 = 0; uk % 2UL == 0UL; uk >>= 1, po2++); mpres_init (V0, modulus); mpres_set_ui (V0, 2UL, modulus); /* V0 = V_0(S) = 2 */ if (uk == 1UL) { mpres_set (R, S, modulus); while (po2-- > 0) { mpres_sqr (R, R, modulus); mpres_sub (R, R, V0, modulus); } mpres_clear (V0, modulus); return; } if (0) { mpz_t tz; mpz_init (tz); mpres_get_z (tz, S, modulus); gmp_printf ("Chebyshev_V(%ld, Mod(%Zd,N)) == ", k, tz); mpz_clear (tz); } for (j = 1UL; j <= uk / 2UL; j <<= 1); mpres_init (Vi, modulus); mpres_init (Vi1, modulus); /* i = 1. Vi = V_i(S), Vi1 = V_{i+1}(S) */ mpres_set (Vi, S, modulus); mpres_sqr (Vi1, S, modulus); mpres_sub (Vi1, Vi1, V0, modulus); j >>= 1; while (j > 1) { if ((uk & j) != 0UL) { /* i' = 2i + 1. V_{i'} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 V_{i'+1} = V_{2i + 2} = {V_{i+1}}^2 - V_0. */ mpres_mul (Vi, Vi, Vi1, modulus); mpres_sub (Vi, Vi, S, modulus); mpres_sqr (Vi1, Vi1, modulus); mpres_sub (Vi1, Vi1, V0, modulus); } else { /* i' = 2i. V_{i'} = V_{2i} = {V_i}^2 - V0. 
V_{i'+1} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 */ mpres_mul (Vi1, Vi, Vi1, modulus); mpres_sub (Vi1, Vi1, S, modulus); mpres_sqr (Vi, Vi, modulus); mpres_sub (Vi, Vi, V0, modulus); } j >>= 1; } /* Least significant bit of uk is always 1 */ mpres_mul (Vi, Vi, Vi1, modulus); mpres_sub (Vi, Vi, S, modulus); while (po2-- > 0) { mpres_sqr (Vi, Vi, modulus); mpres_sub (Vi, Vi, V0, modulus); } mpres_set (R, Vi, modulus); mpres_clear (Vi, modulus); mpres_clear (Vi1, modulus); mpres_clear (V0, modulus); if (0) { mpz_t tz; mpz_init (tz); mpres_get_z (tz, R, modulus); gmp_printf ("%Zd\n", tz); mpz_clear (tz); } } /* Computes U_k(S), where the Chebyshev polynomial U_k(X) is defined by U_k(X + 1/X) = (X^k - 1/X^k) / (X - 1/X) If R1 != NULL, stores U_{k+1}(S) there */ static void U (mpres_t R, mpres_t R1, const mpres_t S, const long k, mpmod_t modulus) { mpres_t V0, Vi, Vi1, Ui, Ui1, t; unsigned long j, uk; if (k == 0L) { mpres_set_ui (R, 0UL, modulus); /* U_0 = 0 */ if (R1 != NULL) mpres_set_ui (R1, 1UL, modulus); /* U_1 = 1 */ return; } uk = labs (k); if (uk == 1UL) { mpres_set_ui (R, 1UL, modulus); if (k == -1) mpres_neg (R, R, modulus); if (R1 != NULL) { if (k == -1) mpres_set_ui (R1, 0UL, modulus); else mpres_set (R1, S, modulus); /* U_2(S) = S */ } return; } if (0) { mpz_t tz; mpz_init (tz); mpres_get_z (tz, S, modulus); gmp_printf ("Chebyshev_U(%ld, Mod(%Zd,N)) == ", k, tz); mpz_clear (tz); } mpres_init (V0, modulus); mpres_init (Vi, modulus); mpres_init (Vi1, modulus); mpres_init (Ui, modulus); mpres_init (Ui1, modulus); mpres_init (t, modulus); for (j = 1UL; j <= uk / 2UL; j <<= 1); mpres_set_ui (Ui, 1UL, modulus); /* Ui = U_1(S) = 1 */ mpres_set (Ui1, S, modulus); /* Ui1 = U_2(S) = S */ mpres_add (V0, Ui, Ui, modulus); /* V0 = V_0(S) = 2 */ mpres_set (Vi, S, modulus); /* Vi = V_1(S) = S */ mpres_sqr (Vi1, Vi, modulus); mpres_sub (Vi1, Vi1, V0, modulus); /* Vi1 = V_2(S) = S^2 - 2 */ j >>= 1; /* i = 1 */ while (j != 0) { if ((uk & j) == 0UL) { mpres_mul 
(Vi1, Vi1, Vi, modulus); mpres_sub (Vi1, Vi1, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */ /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */ mpres_sub (t, Ui1, Ui, modulus); mpres_add (Ui1, Ui1, Ui, modulus); mpres_mul (Ui1, Ui1, t, modulus); mpres_mul (Ui, Ui, Vi, modulus); /* U_{2n} = U_n V_n */ mpres_sqr (Vi, Vi, modulus); mpres_sub (Vi, Vi, V0, modulus); /* V_{2n} = V_n^2 - 2 */ } else { /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */ mpres_sub (t, Ui1, Ui, modulus); mpres_add (Ui, Ui, Ui1, modulus); mpres_mul (Ui, Ui, t, modulus); mpres_mul (Ui1, Ui1, Vi1, modulus); /* U_{2n+2} = U_{n+1} V_{n+1} */ mpres_mul (Vi, Vi, Vi1, modulus); mpres_sub (Vi, Vi, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */ mpres_sqr (Vi1, Vi1, modulus); mpres_sub (Vi1, Vi1, V0, modulus); /* V_{2n+2} = V_{n+1}^2 - 2 */ } j >>= 1; } if (k > 0) mpres_set (R, Ui, modulus); else mpres_neg (R, Ui, modulus); if (R1 != NULL) { /* Here k != -1,0,1, so k+1 is negative iff k is */ if (k > 0) mpres_set (R1, Ui1, modulus); else mpres_neg (R1, Ui1, modulus); } mpres_clear (V0, modulus); mpres_clear (Vi, modulus); mpres_clear (Vi1, modulus); mpres_clear (Ui, modulus); mpres_clear (Ui1, modulus); mpres_clear (t, modulus); if (0) { mpz_t tz; mpz_init (tz); mpres_get_z (tz, R, modulus); gmp_printf ("%Zd\n", tz); mpz_clear (tz); } } /* Set R[i] = V_{i+k}(Q) * F[i] or U_{i+k}(Q) * F[i], for 0 <= i < len We compute V_{i+k+1}(Q) by V_{i+k}(Q)*V_1(Q) - V_{i+k-1}(Q). For U, we compute U_{i+k+1}(Q) by U_{i+k}(Q)*V_1(Q) - U_{i+k-1}(Q). The values of V_1(Q), V_{k-1}(Q) and V_k(Q) and V_k(Q) are in V1, Vk_1 and Vk, resp. The values of Vk_1 and Vk are clobbered. 
*/ static void scale_by_chebyshev (listz_t R, const listz_t F, const unsigned long len, mpmod_t modulus, const mpres_t V1, mpres_t Vk_1, mpres_t Vk) { mpres_t Vt; unsigned long i; mpres_init (Vt, modulus); for (i = 0; i < len; i++) { mpres_mul_z_to_z (R[i], Vk, F[i], modulus); mpres_mul (Vt, Vk, V1, modulus); mpres_sub (Vt, Vt, Vk_1, modulus); mpres_set (Vk_1, Vk, modulus); /* Could be a swap */ mpres_set (Vk, Vt, modulus); /* Could be a swap */ } mpres_clear (Vt, modulus); } /* For a given reciprocal polynomial F(x) = f_0 + sum_{i=1}^{deg} f_i V_i(x+1/x), compute F(\gamma x)F(\gamma^{-1} x), with Q = \gamma + 1 / \gamma If NTT is used, needs 4 * deg + 3 entries in tmp. If no NTT is used, needs 4 * deg + 2 + (memory use of list_sqr_reciprocal) */ static void list_scale_V (listz_t R, const listz_t F, const mpres_t Q, const unsigned long deg, mpmod_t modulus, listz_t tmp, const unsigned long tmplen, mpzspv_t dct, const mpzspm_t ntt_context) { mpres_t Vt; unsigned long i; const listz_t G = tmp, H = tmp + 2 * deg + 1, newtmp = tmp + 4 * deg + 2; const unsigned long newtmplen = tmplen - 4 * deg - 2; #ifdef WANT_ASSERT mpz_t leading; #endif if (deg == 0) { ASSERT(tmplen >= 1); mpz_mul (tmp[0], F[0], F[0]); mpz_mod (R[0], tmp[0], modulus->orig_modulus); return; } /* Make sure newtmplen does not underflow */ ASSERT_ALWAYS (tmplen >= 4 * deg + 2); #ifdef WANT_ASSERT mpz_init (leading); mpz_mul (leading, F[deg], F[deg]); mpz_mod (leading, leading, modulus->orig_modulus); #endif /* Generate V_1(Q)/2 ... V_{deg}(Q)/2, multiply by f_i to form coefficients of G(x). Square the symmetric G(x) polynomial. */ outputf (OUTPUT_TRACE, "list_scale_V: Q=%Zd, deg = %lu\n", Q, deg); list_output_poly (F, deg + 1, 0, 1, "/* list_scale_V */ F(x) = ", "\n", OUTPUT_TRACE); /* Compute G[i] = V_i(Q)/2 * F[i] for i = 0, ..., deg. 
For i=0, V_0(Q) = 2, so G[0] = F[0], which leaves deg entries to process */ mpz_set (G[0], F[0]); #if defined(_OPENMP) #pragma omp parallel if (deg > 1000) #endif { const int nr_chunks = omp_get_num_threads(); const int thread_nr = omp_get_thread_num(); mpmod_t modulus_local; unsigned long l, start_i; mpres_t Vi, Vi_1; l = (deg - 1) / nr_chunks + 1; /* l = ceil (deg / nr_chunks) */ start_i = thread_nr * l + 1; l = MIN(l, deg + 1 - start_i); mpmod_init_set (modulus_local, modulus); mpres_init (Vi_1, modulus_local); mpres_init (Vi, modulus_local); V (Vi, Q, start_i, modulus_local); mpres_div_2exp (Vi, Vi, 1, modulus_local); V (Vi_1, Q, start_i - 1UL, modulus_local); mpres_div_2exp (Vi_1, Vi_1, 1, modulus_local); scale_by_chebyshev (G + start_i, F + start_i, l, modulus_local, Q, Vi_1, Vi); mpres_clear (Vi_1, modulus_local); mpres_clear (Vi, modulus_local); mpmod_clear (modulus_local); } list_output_poly (G, deg + 1, 0, 1, "/* list_scale_V */ G(x) = ", "\n", OUTPUT_TRACE); /* Now square the G polynomial in G[0 .. deg], put result in G[0 .. 2*deg] */ /* Bugfix: ks_multiply() does not like negative coefficients. 
FIXME */ for (i = 0; i <= deg; i++) if (mpz_sgn (G[i]) < 0) { mpz_add (G[i], G[i], modulus->orig_modulus); /* FIXME: make sure the absolute size does not "run away" */ if (mpz_sgn (G[i]) < 0) { outputf (OUTPUT_ERROR, "list_scale_V: G[%lu] still negative\n", i); mpz_mod (G[i], G[i], modulus->orig_modulus); } } if (dct != NULL && ntt_context != NULL) ntt_sqr_reciprocal (G, G, dct, deg + 1, ntt_context); else list_sqr_reciprocal (G, G, deg + 1, modulus->orig_modulus, newtmp, newtmplen); list_output_poly (G, 2 * deg + 1, 0, 1, "/* list_scale_V */ G(x)^2 == ", "\n", OUTPUT_TRACE); /* Compute H[i-1] = U_i(Q)/2 * F[i] for i = 1, ..., deg */ #if defined(_OPENMP) #pragma omp parallel if (deg > 1000) #endif { const int nr_chunks = omp_get_num_threads(); const int thread_nr = omp_get_thread_num(); mpmod_t modulus_local; unsigned long l, start_i; mpres_t Ui, Ui_1; l = (deg - 1) / nr_chunks + 1; /* l = ceil(deg / nr_chunks) */ start_i = thread_nr * l + 1UL; l = MIN(l, deg + 1 - start_i); mpmod_init_set (modulus_local, modulus); mpres_init (Ui_1, modulus_local); mpres_init (Ui, modulus_local); U (Ui_1, Ui, Q, start_i - 1, modulus_local); mpres_div_2exp (Ui, Ui, 1, modulus_local); mpres_div_2exp (Ui_1, Ui_1, 1, modulus_local); scale_by_chebyshev (H - 1 + start_i, F + start_i, l, modulus_local, Q, Ui_1, Ui); mpres_clear (Ui_1, modulus_local); mpres_clear (Ui, modulus_local); mpmod_clear (modulus_local); } /* Convert H to standard basis */ /* We can do it in-place with H - 1 = H_U. */ for (i = deg; i >= 3; i--) { mpz_add (H[i - 3], H[i - 3], H[i - 1]); if (mpz_cmp (H[i - 3], modulus->orig_modulus) >= 0) mpz_sub (H[i - 3], H[i - 3], modulus->orig_modulus); } /* U_2(X+1/X) = (X^2 - 1/X^2)/(X-1/X) = X+1/X = V_1(X+1/X), so no addition occures here */ /* if (deg >= 2) mpz_set (H[1], H[1]); Again, a no-op. */ /* U_1(X+1/X) = 1, so this goes to coefficient of index 0 in std. basis */ /* mpz_set (H[0], H[0]); Another no-op. */ /* Now H[0 ... 
deg-1] contains the deg coefficients in standard basis of symmetric H(X) of degree 2*deg-2. */ list_output_poly (H, deg, 0, 1, "/* list_scale_V */ H(x) = ", "\n", OUTPUT_TRACE); /* Square the symmetric H polynomial of degree 2*deg-2 (i.e. with deg coefficents in standard basis in H[0 ... deg-1]) */ /* Bugfix: ks_multiply() does not like negative coefficients. */ for (i = 0; i <= deg; i++) if (mpz_sgn (H[i]) < 0) { mpz_add (H[i], H[i], modulus->orig_modulus); if (mpz_sgn (H[i]) < 0) { outputf (OUTPUT_ERROR, "list_scale_V: H[%lu] still negative\n", i); mpz_mod (H[i], H[i], modulus->orig_modulus); } } if (dct != NULL && ntt_context != NULL) ntt_sqr_reciprocal (H, H, dct, deg, ntt_context); else list_sqr_reciprocal (H, H, deg, modulus->orig_modulus, newtmp, newtmplen); /* Now there are the 2*deg-1 coefficients in standard basis of a symmetric polynomial of degree 4*deg - 4 in H[0 ... 2*deg-2] */ list_output_poly (H, 2*deg - 1, 0, 1, "/* list_scale_V */ H(x)^2 == ", "\n", OUTPUT_TRACE); /* Multiply by Q^2-4 */ mpres_init (Vt, modulus); mpres_sqr (Vt, Q, modulus); mpres_sub_ui (Vt, Vt, 4, modulus); #if defined(_OPENMP) #pragma omp parallel if (deg > 1000) { mpmod_t modulus_local; long i; /* OpenMP insists on signed loop iteration var :( */ mpmod_init_set (modulus_local, modulus); #pragma omp for for (i = 0; (unsigned long) i <= 2 * deg - 2; i++) mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local); mpmod_clear (modulus_local); } #else for (i = 0; (unsigned long) i <= 2 * deg - 2; i++) mpres_mul_z_to_z (H[i], Vt, H[i], modulus); #endif list_output_poly (H, 2 * deg - 1, 0, 1, "/* list_scale_V */ " "H(x)^2*(Q^2-4) == ", "\n", OUTPUT_TRACE); /* Multiply by (X - 1/X)^2 = X^2 - 2 + 1/X^2 and subtract from G */ ASSERT (newtmplen > 0UL); if (deg == 1) { /* H(X) has degree 2*deg-2 = 0, so H(X) = h_0 H(X) * (X - 1/X)^2 = -2 h_0 + h_0 V_2(Y) */ mpz_mul_2exp (newtmp[0], H[0], 1UL); mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*H[0] */ mpz_sub (G[2], G[2], H[0]); } else if (deg == 2) { 
/* H(X) has degree 2*deg-2 = 2, , so H(X) = h_0 + h_1 (X+1/X) + h_2 (X^2+1/X^2) H(X) * (X - 1/X)^2 = -2*(h_0 - h_2) - h_1 * V_1(Y) + (h_0 - 2*h_2) * V_2(Y) + h_1 * V_3(Y) + h_2 * V_4(Y) */ mpz_sub (newtmp[0], H[0], H[2]); /* h_0 - h_2 */ mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* 2*(h_0 - h_2) */ mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*(h_0 - h_2) */ mpz_add (G[1], G[1], H[1]); /* G[1] -= -h_1 */ mpz_sub (newtmp[0], newtmp[0], H[0]); /* h_0 - 2*h_2 */ mpz_sub (G[2], G[2], newtmp[0]); /* G[2] -= h_0 - 2*h_2 */ mpz_sub (G[3], G[3], H[1]); /* G[3] -= h_1 */ mpz_sub (G[4], G[4], H[2]); /* G[3] -= h_2 */ } else { /* Let H(X) = h_0 + \sum_{i=1}^{n} h_i V_i(Y), Y = X+1/X. Then (x - 1/x)^2 H(X) = -2(h_0 - h_2) + (- h_1 + h_3) V_1(Y) + \sum_{i=2}^{n-2} (h_{i-2} - 2h_i + h_{i+2}) V_i(Y) + (h_{n-3} - 2h_{n-1}) V_{n-1}(Y) + (h_{n-2} - 2h_n) V_n(Y) + h_{n-1} V_{n+1}(Y) + h_n V_{n+2}(Y) In our case, n = 2 * deg - 2 */ mpz_sub (newtmp[0], H[0], H[2]); mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* t[0] = 2*(h_0 - h_2) */ mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*(h_0 - h_2) */ mpz_add (G[1], G[1], H[1]); mpz_sub (G[1], G[1], H[3]); /* G[1] -= -h_1 + h_3 */ for (i = 2; i <= 2 * deg - 4; i++) { mpz_mul_2exp (newtmp[0], H[i], 1); mpz_sub (newtmp[0], newtmp[0], H[i - 2]); mpz_sub (newtmp[0], newtmp[0], H[i + 2]); /* 2h_i-h_{i-2}-h_{i+2} */ mpz_add (G[i], G[i], newtmp[0]); /* G[i] -= -2h_i+h_{i-2}+h_{i+2} */ } for ( ; i <= 2 * deg - 2; i++) { mpz_mul_2exp (newtmp[0], H[i], 1UL); mpz_sub (newtmp[0], H[i - 2], newtmp[0]); /* h_{n-3} - 2h_{n-1} */ mpz_sub (G[i], G[i], newtmp[0]); } mpz_sub (G[i], G[i], H[i - 2]); mpz_sub (G[i + 1], G[i + 1], H[i - 1]); } for (i = 0; i <= 2 * deg; i++) mpz_mod (R[i], G[i], modulus->orig_modulus); if (test_verbose (OUTPUT_TRACE)) for (i = 0; i <= 2 * deg; i++) outputf (OUTPUT_TRACE, "list_scale_V: R[%lu] = %Zd\n", i, R[i]); #ifdef WANT_ASSERT mpz_mod (R[2 * deg], R[2 * deg], modulus->orig_modulus); ASSERT (mpz_cmp (leading, R[2 * deg]) == 0); 
mpz_clear (leading); #endif mpres_clear (Vt, modulus); } #ifdef WANT_ASSERT /* Check if l is an (anti-)symmetric, possibly monic, polynomial. Returns -1 if it is (anti-)symmetric, or the smallest index i where l[i] != l[len - 1 + monic - i]) If anti == 1, the list is checked for symmetry, if it is -1, for antisymmetry. This function is used only if assertions are enabled. */ static long int ATTRIBUTE_UNUSED list_is_symmetric (listz_t l, unsigned long len, int monic, int anti, mpz_t modulus, mpz_t tmp) { unsigned long i; ASSERT (monic == 0 || monic == 1); ASSERT (anti == 1 || anti == -1); if (monic && anti == 1 && mpz_cmp_ui (l[0], 1) != 0) return 0L; if (monic && anti == -1) { mpz_sub_ui (tmp, modulus, 1); if (mpz_cmp (tmp, l[0]) != 0) return 0L; } for (i = monic; i < len / 2; i++) { if (anti == -1) { /* Negate (mod modulus) */ if (mpz_sgn (l[i]) == 0) { if (mpz_sgn (l[len - 1 + monic - i]) != 0) return (long) i; } else { mpz_sub (tmp, modulus, l[i]); if (mpz_cmp (tmp, l[len - 1 + monic - i]) != 0) return (long) i; } } else if (mpz_cmp (l[i], l[len - 1 + monic - i]) != 0) return (long) i; } return -1L; } #endif /* Evaluate a polynomial of degree n-1 with all coefficients given in F[], or of degree n with an implicit leading 1 monomial not stored in F[], at x modulo modulus. Result goes in r. tmp needs 2 entries. */ ATTRIBUTE_UNUSED static void list_eval_poly (mpz_t r, const listz_t F, const mpz_t x, const unsigned long n, const int monic, const mpz_t modulus, listz_t tmp) { unsigned long i; mpz_set_ui (tmp[0], 1UL); mpz_set_ui (r, 0UL); for (i = 0UL; i < n; i++) { /* tmp[0] = x^i */ mpz_mul (tmp[1], F[i], tmp[0]); mpz_mod (tmp[1], tmp[1], modulus); mpz_add (r, r, tmp[1]); mpz_mul (tmp[1], tmp[0], x); mpz_mod (tmp[0], tmp[1], modulus); } if (monic) mpz_add (r, r, tmp[0]); mpz_mod (r, r, modulus); } /* Build a polynomial with roots r^2i, i in the sumset of the sets in "sets". The parameter Q = r + 1/r. This code uses the fact that the polynomials are symmetric. 
Requires that the first set in "sets" has cardinality 2, all sets must be symmetric around 0. The resulting polynomial of degree 2*d is F(x) = f_0 + \sum_{1 <= i <= d} f_i (x^i + 1/x^i). The coefficient f_i is stored in F[i], which therefore needs d+1 elements. */ static unsigned long poly_from_sets_V (listz_t F, const mpres_t Q, sets_long_t *sets, listz_t tmp, const unsigned long tmplen, mpmod_t modulus, mpzspv_t dct, const mpzspm_t ntt_context) { unsigned long c, deg, i, nr; set_long_t *set = sets->sets; mpres_t Qt; ASSERT_ALWAYS (sets->nr > 0UL); ASSERT_ALWAYS (set->card == 2UL); /* Check that the cardinality of first set is 2 */ /* Check that first set is symmetric around 0 (we write card-1 instead of 1 to avoid a compiler warning with clang 2.9) */ ASSERT_ALWAYS (set->elem[0] == -set->elem[set->card - 1]); if (test_verbose (OUTPUT_TRACE)) { mpz_t t; mpz_init (t); mpres_get_z (t, Q, modulus); outputf (OUTPUT_TRACE, "poly_from_sets_V (F, Q = %Zd, sets)\n", t); mpz_clear (t); } mpres_init (Qt, modulus); outputf (OUTPUT_DEVVERBOSE, " (processing set of size 2"); V (Qt, Q, set->elem[0], modulus); /* First set in sets is {-k, k} */ V (Qt, Qt, 2UL, modulus); /* Qt = V_2k(Q) */ mpres_neg (Qt, Qt, modulus); mpres_get_z (F[0], Qt, modulus); mpz_set_ui (F[1], 1UL); deg = 1UL; /* Here, F(x) = (x - r^{2k_1})(x - r^{-2k_1}) / x = (x^2 - x (r^{2k_1} + r^{-2k_1}) + 1) / x = (x + 1/x) - V_{2k_1}(r + 1/r) */ for (nr = sets->nr - 1UL; nr > 0UL; nr--) { /* Assuming the sets are sorted in order of ascending cardinality, we process them back-to-front so the sets of cardinality 2 are processed last, but skipping the first set which we processed already. */ set = sets_nextset (sets->sets); /* Skip first set */ for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */ set = sets_nextset (set); /* Process this set. 
We assume it is either of cardinality 2, or of odd cardinality */ c = set->card; outputf (OUTPUT_DEVVERBOSE, " %lu", c); if (c == 2UL) { /* Check it's symmetric (we write c-1 instead of 2 to avoid a compiler warning with clang 2.9) */ ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]); V (Qt, Q, set->elem[0], modulus); V (Qt, Qt, 2UL, modulus); list_scale_V (F, F, Qt, deg, modulus, tmp, tmplen, dct, ntt_context); deg *= 2UL; ASSERT_ALWAYS (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */ } else { ASSERT_ALWAYS (c % 2UL == 1UL); ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL); /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials. Each is symmetric of degree 2*deg, so each has deg+1 coeffients in standard basis. */ for (i = 0UL; i < (c - 1UL) / 2UL; i++) { /* Check it's symmetric */ ASSERT_ALWAYS (set->elem[i] == -set->elem[c - 1L - i]); V (Qt, Q, set->elem[i], modulus); V (Qt, Qt, 2UL, modulus); ASSERT (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */ list_scale_V (F + (2UL * i + 1UL) * (deg + 1UL), F, Qt, deg, modulus, tmp, tmplen, dct, ntt_context); ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL * deg], 1UL) == 0); /* Check it's monic */ } /* Multiply the polynomials */ for (i = 0UL; i < (c - 1UL) / 2UL; i++) { /* So far, we have the product F(X) * F(Q^{2k_j} * X) * F(Q^{-2k_j} * X), 1 <= j <= i, at F. This product has degree 2 * deg + i * 4 * deg, that is (2 * i + 1) * 2 * deg, which means (2 * i + 1) * deg + 1 coefficients in F[0 ... (i * 2 + 1) * deg]. 
*/ ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * deg], 1UL) == 0); ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL*deg], 1UL) == 0); list_output_poly (F, (2UL * i + 1UL) * deg + 1, 0, 1, "poly_from_sets_V: Multiplying ", "\n", OUTPUT_TRACE); list_output_poly (F + (2UL * i + 1UL) * (deg + 1UL), 2UL * deg + 1UL, 0, 1, " and ", "\n", OUTPUT_TRACE); list_mul_reciprocal (F, F, (2UL * i + 1UL) * deg + 1UL, F + (2UL * i + 1UL) * (deg + 1UL), 2UL * deg + 1UL, modulus->orig_modulus, tmp, tmplen); list_mod (F, F, (2UL * i + 3UL) * deg + 1UL, modulus->orig_modulus); list_output_poly (F, (2UL * i + 3UL) * deg + 1UL, 0, 1, " = ", "\n", OUTPUT_TRACE); ASSERT (mpz_cmp_ui (F[(2UL * i + 3UL) * deg], 1UL) == 0); } deg *= c; } } mpres_clear (Qt, modulus); outputf (OUTPUT_DEVVERBOSE, ")"); return deg; } static int build_F_ntt (listz_t F, const mpres_t P_1, sets_long_t *S_1, const faststage2_param_t *params, mpmod_t modulus) { mpzspm_t F_ntt_context; mpzspv_t F_ntt; unsigned long tmplen; listz_t tmp; long timestart, realstart; unsigned long i; timestart = cputime (); realstart = realtime (); /* Precompute the small primes, primitive roots and inverses etc. for the NTT. 
The code to multiply wants a 3*k-th root of unity, where k is the smallest power of 2 with k > s_1/2 */ F_ntt_context = mpzspm_init (3UL << ceil_log2 (params->s_1 / 2 + 1), modulus->orig_modulus); if (F_ntt_context == NULL) { outputf (OUTPUT_ERROR, "Could not initialise F_ntt_context, " "presumably out of memory\n"); return ECM_ERROR; } print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for building F = ", F_ntt_context); outputf (OUTPUT_VERBOSE, "Computing F from factored S_1"); tmplen = params->s_1 + 100; tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits)); F_ntt = mpzspv_init (1UL << ceil_log2 (params->s_1 / 2 + 1), F_ntt_context); i = poly_from_sets_V (F, P_1, S_1, tmp, tmplen, modulus, F_ntt, F_ntt_context); ASSERT_ALWAYS(2 * i == params->s_1); ASSERT_ALWAYS(mpz_cmp_ui (F[i], 1UL) == 0); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < params->s_1 / 2 + 1; i++) outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]); outputf (OUTPUT_TRACE, "f(x) = f_0"); for (i = 1; i < params->s_1 / 2 + 1; i++) outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i); outputf (OUTPUT_TRACE, "/* PARI */ \n"); } clear_list (tmp, tmplen); tmp = NULL; mpzspv_clear (F_ntt, F_ntt_context); F_ntt = NULL; mpzspm_clear (F_ntt_context); F_ntt_context = NULL; return 0; } /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l. x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. Stores the result in g[0 ... l] and/or in g_ntt[offset ... 
offset + l] */ static void pm1_sequence_g (listz_t g_mpz, mpzspv_t g_ntt, const mpres_t b_1, const unsigned long P, const long M_param, const unsigned long l_param, const mpz_t m_1, const long k_2, mpmod_t modulus_param, const mpzspm_t ntt_context) { mpres_t r[3], x_0, x_Mi; mpz_t t; unsigned long i; long timestart, realstart; long M = M_param; unsigned long l = l_param, offset = 0UL; mpmod_t modulus; int want_output = 1; outputf (OUTPUT_VERBOSE, "Computing g_i"); outputf (OUTPUT_DEVVERBOSE, "\npm1_sequence_g: P = %lu, M_param = %lu, " "l_param = %lu, m_1 = %Zd, k_2 = %lu\n", P, M_param, l_param, m_1, k_2); timestart = cputime (); realstart = realtime (); #ifdef _OPENMP #pragma omp parallel if (l > 100) private(r, x_0, x_Mi, t, i, M, l, offset, modulus, want_output) { /* When multi-threading, we adjust the parameters for each thread */ const int nr_chunks = omp_get_num_threads(); const int thread_nr = omp_get_thread_num(); l = (l_param - 1) / nr_chunks + 1; /* = ceil(l_param / nr_chunks) */ offset = thread_nr * l; outputf (OUTPUT_DEVVERBOSE, "pm1_sequence_g: thread %d has l = %lu, offset = %lu.\n", thread_nr, l, offset); ASSERT_ALWAYS (l_param >= offset); l = MIN(l, l_param - offset); M = M_param - (long) offset; /* Let only the master thread print stuff */ want_output = (thread_nr == 0); if (want_output) outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks); #endif /* Make a private copy of the mpmod_t struct */ mpmod_init_set (modulus, modulus_param); mpz_init (t); mpres_init (r[0], modulus); mpres_init (r[1], modulus); mpres_init (r[2], modulus); mpres_init (x_0, modulus); mpres_init (x_Mi, modulus); if (want_output) { if (test_verbose (OUTPUT_TRACE)) { mpres_get_z (t, b_1, modulus); outputf (OUTPUT_TRACE, "\n/* pm1_sequence_g */ N = %Zd; " "b_1 = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, t); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ P = %lu; M = %ld; " "m_1 = %Zd; /* PARI */\n", P, M, m_1); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r = b_1^P; 
/* PARI */\n"); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 = " "b_1^(2*%ld + (2*m_1 + 1)*P); /* PARI */\n", k_2); } } /* We use (M-(i+1))^2 = (M-i)^2 + 2*(-M+i) + 1 */ mpz_set_ui (t, P); mpres_pow (r[0], b_1, t, modulus); /* r[0] = b_1^P = r */ if (test_verbose (OUTPUT_TRACE)) { mpres_get_z (t, r[0], modulus); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r == %Zd /* PARI C */\n", t); } /* FIXME: This is a huge mess, clean up some time */ mpz_set_si (t, M); mpz_neg (t, t); mpz_mul_2exp (t, t, 1UL); mpz_add_ui (t, t, 1UL); mpres_pow (r[1], r[0], t, modulus); /* r[1] = r^{2(-M+i)+1}, i = 0 */ mpz_set_si (t, M); mpz_mul (t, t, t); /* t = M^2 */ mpres_pow (r[2], r[0], t, modulus); /* r[2] = r^{(M-i)^2}, i = 0 */ mpres_sqr (r[0], r[0], modulus); /* r[0] = r^2 */ mpz_mul_2exp (t, m_1, 1UL); mpz_add_ui (t, t, 1UL); mpz_mul_ui (t, t, P); mpz_add_si (t, t, k_2); mpz_add_si (t, t, k_2); if (want_output) outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ 2*%ld + (2*%Zd + 1)*P == " "%Zd /* PARI C */\n", k_2, m_1, t); mpres_pow (x_0, b_1, t, modulus); /* x_0 = b_1^{2*k_2 + (2*m_1 + 1)*P} */ if (want_output && test_verbose (OUTPUT_TRACE)) { mpres_get_z (t, x_0, modulus); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 == %Zd /* PARI C */\n", t); } mpz_set_si (t, M); mpres_pow (x_Mi, x_0, t, modulus); /* x_Mi = x_0^{M-i}, i = 0 */ mpres_invert (x_0, x_0, modulus); /* x_0 := x_0^{-1} now */ mpres_mul (r[1], r[1], x_0, modulus); /* r[1] = x_0^{-1} * r^{-2M+1} */ mpres_mul (r[2], r[2], x_Mi, modulus); /* r[2] = x_0^M * r^{M^2} */ mpres_get_z (t, r[2], modulus); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd; /* PARI */\n", offset, t); if (g_mpz != NULL) mpz_set (g_mpz[offset], t); if (g_ntt != NULL) mpzspv_from_mpzv (g_ntt, offset, &t, 1UL, ntt_context); /* So here we have for i = 0 r[2] = x_0^(M-i) * r^{(M-i)^2} r[1] = x_0^{-1} * r^{2(-M+i)+1} r[0] = r^2 t = r[2] */ for (i = 1; i < l; i++) { if (g_mpz != NULL) { mpres_mul_z_to_z (g_mpz[offset + i], r[1], g_mpz[offset + i 
- 1], modulus); outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;" " /* PARI */\n", offset + i, g_mpz[offset + i]); } if (g_ntt != NULL) { mpres_mul_z_to_z (t, r[1], t, modulus); if (g_mpz == NULL) /* Only one should be non-NULL... */ outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;" " /* PARI */\n", offset + i, t); mpzspv_from_mpzv (g_ntt, offset + i, &t, 1UL, ntt_context); } mpres_mul (r[1], r[1], r[0], modulus); } mpres_clear (r[0], modulus); mpres_clear (r[1], modulus); mpres_clear (r[2], modulus); mpres_clear (x_0, modulus); mpres_clear (x_Mi, modulus); mpz_clear (t); mpmod_clear (modulus); /* Clear our private copy of modulus */ #ifdef _OPENMP } #endif print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < l_param; i++) { outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu == x_0^" "(M - %lu) * r^((M - %lu)^2) /* PARI C */\n", i, i, i); } outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g(x) = g_0"); for (i = 1; i < l; i++) outputf (OUTPUT_TRACE, " + g_%lu * x^%lu", i, i); outputf (OUTPUT_TRACE, " /* PARI */\n"); } } /* Compute h_j = r^(-j^2) * f_j for 0 <= j < d as described in section 9 of the paper. h == f is ok. */ static void pm1_sequence_h (listz_t h, mpzspv_t h_ntt, mpz_t *f, const mpres_t r, const unsigned long d, mpmod_t modulus_parm, const mpzspm_t ntt_context) { mpres_t invr; /* r^{-1}. Can be shared between threads */ long timestart, realstart; mpres_init (invr, modulus_parm); mpres_invert (invr, r, modulus_parm); /* invr = r^{-1}. 
FIXME: test for failure, even if theoretically impossible */ if (test_verbose (OUTPUT_TRACE)) { mpz_t t; mpz_init (t); mpres_get_z (t, r, modulus_parm); outputf (OUTPUT_TRACE, "\n/* pm1_sequence_h */ N = %Zd; " "r = Mod(%Zd, N); /* PARI */\n", modulus_parm->orig_modulus, t); mpz_clear (t); } outputf (OUTPUT_VERBOSE, "Computing h"); timestart = cputime (); realstart = realtime (); #ifdef _OPENMP #pragma omp parallel if (d > 100) #endif { mpres_t fd[3]; /* finite differences table for r^{-i^2}*/ mpz_t t; /* the h_j value as an mpz_t */ unsigned long j; unsigned long offset = 0UL, len = d; mpmod_t modulus; /* Adjust offset and length for this thread */ #ifdef _OPENMP { const int nr_chunks = omp_get_num_threads(); const int thread_nr = omp_get_thread_num(); unsigned long chunklen; if (thread_nr == 0) outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks); chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL; offset = chunklen * (unsigned long) thread_nr; len = MIN(chunklen, len - offset); } #endif mpmod_init_set (modulus, modulus_parm); mpres_init (fd[0], modulus); mpres_init (fd[1], modulus); mpres_init (fd[2], modulus); mpz_init (t); /* We have (n + 1)^2 = n^2 + 2n + 1. For the finite differences we'll need r^{-2}, r^{-(2n+1)}, r^{-n^2}. Init for n = 0. */ /* r^{-2} in fd[0] is constant and could be shared. Computing it separately in each thread has the advantage of putting it in local memory. 
May not make much difference overall */ mpres_sqr (fd[0], invr, modulus); /* fd[0] = r^{-2} */ mpz_set_ui (t, offset); mpz_mul_2exp (t, t, 1UL); mpz_add_ui (t, t, 1UL); /* t = 2 * offset + 1 */ mpres_pow (fd[1], invr, t, modulus); /* fd[1] = r^{-(2*offset+1)} */ mpz_set_ui (t, offset); mpz_mul (t, t, t); /* t = offset^2 */ mpres_pow (fd[2], invr, t, modulus); /* fd[2] = r^{-offset^2} */ /* Generate the sequence */ for (j = offset; j < offset + len; j++) { mpres_mul_z_to_z (t, fd[2], f[j], modulus); outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu = %Zd; /* PARI */\n", j, t); if (h != NULL) mpz_set (h[j], t); if (h_ntt != NULL) mpzspv_from_mpzv (h_ntt, j, &t, 1UL, ntt_context); mpres_mul (fd[2], fd[2], fd[1], modulus); /* fd[2] = r^{-j^2} */ mpres_mul (fd[1], fd[1], fd[0], modulus); /* fd[1] = r^{-2*j-1} */ } mpres_clear (fd[2], modulus); mpres_clear (fd[1], modulus); mpres_clear (fd[0], modulus); mpz_clear (t); mpmod_clear (modulus); } mpres_clear (invr, modulus_parm); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); if (test_verbose (OUTPUT_TRACE)) { unsigned long j; for (j = 0; j < d; j++) outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu == " "f_%lu * r^(-%lu^2) /* PARI C */\n", j, j, j); outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h(x) = h_0"); for (j = 1; j < d; j++) outputf (OUTPUT_TRACE, " + h_%lu * (x^%lu + x^(-%lu))", j, j, j); outputf (OUTPUT_TRACE, " /* PARI */\n"); } } static int make_S_1_S_2 (sets_long_t **S_1, set_long_t **S_2, const faststage2_param_t *params) { unsigned long i; sets_long_t *facS_2; size_t facS_2_size; *S_1 = sets_get_factored_sorted (params->P); if (*S_1 == NULL) return ECM_ERROR; { mpz_t t1, t2; mpz_init (t1); mpz_init (t2); sets_sumset_minmax (t1, *S_1, 1); sets_max (t2, params->P); ASSERT_ALWAYS (mpz_cmp (t1, t2) == 0); mpz_clear (t1); mpz_clear (t2); } *S_2 = malloc (set_sizeof(params->s_2)); if (*S_2 == NULL) { free (*S_1); return ECM_ERROR; } /* Extract sets for S_2 and compute the set of sums */ sets_extract 
(NULL, &facS_2_size, *S_1, params->s_2); facS_2 = malloc (facS_2_size); if (facS_2 == NULL) { free (*S_1); free (*S_2); return ECM_ERROR; } sets_extract (facS_2, NULL, *S_1, params->s_2); sets_sumset (*S_2, facS_2); ASSERT_ALWAYS ((*S_2)->card == params->s_2); free (facS_2); quicksort_long ((*S_2)->elem, (*S_2)->card); /* Print the sets in devverbose mode */ if (test_verbose (OUTPUT_DEVVERBOSE)) { outputf (OUTPUT_DEVVERBOSE, "S_1 = "); sets_print (OUTPUT_DEVVERBOSE, *S_1); outputf (OUTPUT_DEVVERBOSE, "S_2 = {"); for (i = 0UL; i + 1UL < params->s_2; i++) outputf (OUTPUT_DEVVERBOSE, "%ld, ", (*S_2)->elem[i]); if (i < params->s_2) outputf (OUTPUT_DEVVERBOSE, "%ld", (*S_2)->elem[i]); outputf (OUTPUT_DEVVERBOSE, "}\n"); } return 0; } ATTRIBUTE_UNUSED static mpzspv_t * mpzspv_init_mt (spv_size_t len, mpzspm_t mpzspm) { int i; /* OpenMP wants the iteration variable a signed type */ mpzspv_t *x = (mpzspv_t *) malloc (mpzspm->sp_num * sizeof (spv_t *)); if (x == NULL) return NULL; for (i = 0; i < (int) mpzspm->sp_num; i++) x[i] = NULL; #ifdef _OPENMP #pragma omp parallel private(i) shared(x) { #pragma omp for #endif for (i = 0; i < (int) mpzspm->sp_num; i++) x[i] = (spv_t *) sp_aligned_malloc (len * sizeof (sp_t)); #ifdef _OPENMP } #endif for (i = 0; i < (int) mpzspm->sp_num; i++) if (x[i] == NULL) break; if (i != (int) mpzspm->sp_num) /* There is a NULL pointer */ { for (i = 0; i < (int) mpzspm->sp_num; i++) if (x[i] != NULL) sp_aligned_free(x[i]); return NULL; } #if 0 if (test_verbose (OUTPUT_DEVVERBOSE)) { spv_t * last = x[0]; printf ("mpzspv_init_mt: x[0] = %p\n", x[0]); for (i = 1; i < (int) mpzspm->sp_num; i++) printf ("mpzspv_init_mt: x[%d] = %p, distance = %ld\n", i, x[i], (long) (x[i] - x[i-1])); } #endif return x; } ATTRIBUTE_UNUSED static void ntt_print_vec (const char *msg, const spv_t spv, const spv_size_t l) { spv_size_t i; /* Warning: on some computers, for example gcc49.fsffrance.org, "unsigned long" might be shorter than "sp_t" */ gmp_printf ("%s [%Nd", 
msg, (mp_ptr) spv, 1); for (i = 1; i < l; i++) gmp_printf (", %Nd", (mp_ptr) spv + i, 1); printf ("]\n"); } /* Square the reciprocal Laurent polynomial S(x) of degree 2*n-2. S(x) = s_0 + \sum_{i=1}^{n-1} s_i (x^i + x^{-1}). S[i] contains the n coefficients s_i, 0 <= i <= n-1. R[i] will contain the 2n-1 coefficients r_i, 0 <= i <= 2*n-2, where R(x) = S(x)^2 = r_0 + \sum_{i=1}^{2n-2} r_i (x^i + x^{-1}). dft must have power of 2 length len >= 2n. The NTT primes must be == 1 (mod 3*len). */ #undef TRACE_ntt_sqr_reciprocal static void ntt_sqr_reciprocal (mpzv_t R, const mpzv_t S, mpzspv_t dft, const spv_size_t n, const mpzspm_t ntt_context) { #ifdef WANT_ASSERT mpz_t S_eval_1, R_eval_1; #endif if (n == 0) return; if (n == 1) { mpz_mul (R[0], S[0], S[0]); mpz_mod (R[0], R[0], ntt_context->modulus); return; } #ifdef WANT_ASSERT mpz_init (S_eval_1); list_recip_eval1 (S_eval_1, S, n); /* Compute (S(1))^2 */ mpz_mul (S_eval_1, S_eval_1, S_eval_1); mpz_mod (S_eval_1, S_eval_1, ntt_context->modulus); #endif #ifdef TRACE_ntt_sqr_reciprocal printf ("ntt_sqr_reciprocal: n %lu, length %lu\n", n, len); gmp_printf ("Input polynomial is %Zd", S[0]); { int j; for (j = 1; (spv_size_t) j < n; j++) gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", S[j], j, j); } printf ("\n"); #endif /* Fill NTT elements [0 .. 
n-1] with coefficients */ mpzspv_from_mpzv (dft, (spv_size_t) 0, S, n, ntt_context); mpzspv_sqr_reciprocal (dft, n, ntt_context); #if defined(_OPENMP) #pragma omp parallel if (n > 50) #endif { spv_size_t i, offset = 0, chunklen = 2*n - 1; #if defined(_OPENMP) { const int nr_chunks = omp_get_num_threads(); const int thread_nr = omp_get_thread_num(); chunklen = (chunklen - 1) / (spv_size_t) nr_chunks + 1; offset = (spv_size_t) thread_nr * chunklen; if (2*n - 1 > offset) chunklen = MIN(chunklen, (2*n - 1) - offset); else chunklen = 0UL; } #endif mpzspv_to_mpzv (dft, offset, R + offset, chunklen, ntt_context); for (i = offset; i < offset + chunklen; i++) mpz_mod (R[i], R[i], ntt_context->modulus); } #ifdef TRACE_ntt_sqr_reciprocal gmp_printf ("ntt_sqr_reciprocal: Output polynomial is %Zd", R[0]); for (j = 1; (spv_size_t) j < 2*n - 1; j++) gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j); printf ("\n"); #endif #ifdef WANT_ASSERT mpz_init (R_eval_1); /* Compute (S^2)(1) and compare to (S(1))^2 */ list_recip_eval1 (R_eval_1, R, 2 * n - 1); mpz_mod (R_eval_1, R_eval_1, ntt_context->modulus); if (mpz_cmp (R_eval_1, S_eval_1) != 0) { gmp_fprintf (stderr, "ntt_sqr_reciprocal: (S(1))^2 = %Zd but " "(S^2)(1) = %Zd\n", S_eval_1, R_eval_1); #if 0 gmp_printf ("Output polynomial is %Zd", R[0]); for (j = 1; (spv_size_t) j < 2*n - 1; j++) gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j); printf ("\n"); #endif abort (); } mpz_clear (S_eval_1); mpz_clear (R_eval_1); #endif } /* Computes gcd(\prod_{0 <= i < len} (ntt[i + offset] + add[i]), N), the NTT residues are converted to integer residues (mod N) first. If add == NULL, add[i] is assumed to be 0. 
*/ static void ntt_gcd (mpz_t f, mpz_t *product, mpzspv_t ntt, const unsigned long ntt_offset, const listz_t add, const unsigned long len_param, const mpzspm_t ntt_context, mpmod_t modulus_param) { unsigned long i, j; const unsigned long Rlen = MPZSPV_NORMALISE_STRIDE; listz_t R; unsigned long len = len_param, thread_offset = 0; mpres_t tmpres, tmpprod, totalprod; mpmod_t modulus; long timestart, realstart; outputf (OUTPUT_VERBOSE, "Computing gcd of coefficients and N"); timestart = cputime (); realstart = realtime (); /* All the threads will multiply their partial products to this one. */ mpres_init (totalprod, modulus_param); mpres_set_ui (totalprod, 1UL, modulus_param); #ifdef _OPENMP #pragma omp parallel if (len > 100) private(i, j, R, len, thread_offset, tmpres, tmpprod, modulus) shared(totalprod) { const int nr_chunks = omp_get_num_threads(); const int thread_nr = omp_get_thread_num(); len = (len_param - 1) / nr_chunks + 1; thread_offset = thread_nr * len; ASSERT (len_param >= thread_offset); len = MIN(len, len_param - thread_offset); #pragma omp master { outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks); } #endif /* Make a private copy of the mpmod_t struct */ mpmod_init_set (modulus, modulus_param); MEMORY_TAG; R = init_list2 (Rlen, (mpz_size (modulus->orig_modulus) + 2) * GMP_NUMB_BITS); MEMORY_UNTAG; mpres_init (tmpres, modulus); mpres_init (tmpprod, modulus); mpres_set_ui (tmpprod, 1UL, modulus); for (i = 0; i < len; i += Rlen) { const unsigned long blocklen = MIN(len - i, Rlen); /* Convert blocklen residues from NTT to integer representatives and store them in R */ mpzspv_to_mpzv (ntt, ntt_offset + thread_offset + i, R, blocklen, ntt_context); /* Accumulate product in tmpprod */ for (j = 0; j < blocklen; j++) { outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[j]); if (add != NULL) mpz_add (R[j], R[j], add[i + thread_offset + j]); mpres_set_z_for_gcd (tmpres, R[j], modulus); #define TEST_ZERO_RESULT #ifdef TEST_ZERO_RESULT if 
(mpres_is_zero (tmpres, modulus)) outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i); #endif mpres_mul (tmpprod, tmpprod, tmpres, modulus); } } #ifdef _OPENMP #pragma omp critical { mpres_mul (totalprod, totalprod, tmpprod, modulus); } #else mpres_set (totalprod, tmpprod, modulus); #endif mpres_clear (tmpres, modulus); mpres_clear (tmpprod, modulus); mpmod_clear (modulus); clear_list (R, Rlen); #ifdef _OPENMP } #endif if (product != NULL) mpres_get_z (*product, totalprod, modulus_param); mpres_gcd (f, totalprod, modulus_param); mpres_clear (totalprod, modulus_param); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); } int pm1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus, const faststage2_param_t *params) { unsigned long phiP, nr; unsigned long i, l, lenF, lenG, lenR, tmplen; sets_long_t *S_1; /* This is stored as a set of sets (arithmetic progressions of prime length */ set_long_t *S_2; /* This is stored as a regular set */ listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has degree s_1. It is symmetric, so has only s_1 / 2 + 1 distinct coefficients. The sequence h_j will be stored in the same memory and won't be a monic polynomial, so the leading 1 monomial of F will be stored explicitly. Hence we need s_1 / 2 + 1 entries. */ listz_t g, h, tmp, R; mpz_t mt; /* All-purpose temp mpz_t */ mpres_t mr; /* All-purpose temp mpres_t */ int youpi = ECM_NO_FACTOR_FOUND; long timetotalstart, realtotalstart, timestart; timetotalstart = cputime (); realtotalstart = realtime (); phiP = eulerphi (params->P); ASSERT_ALWAYS (phiP == params->s_1 * params->s_2); ASSERT_ALWAYS (params->s_1 < params->l); nr = params->l - params->s_1; /* Number of points we evaluate */ if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR) return ECM_ERROR; /* Allocate all the memory we'll need */ /* Allocate the correct amount of space for each mpz_t or the reallocations will up to double the time for stage 2! 
*/ mpz_init (mt); mpres_init (mr, modulus); lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores the leading 1 monomial for each factor */ F = init_list2 (lenF, (unsigned int) abs (modulus->bits)); h = malloc ((params->s_1 + 1) * sizeof (mpz_t)); if (h == NULL) { fprintf (stderr, "Cannot allocate memory in pm1fs2\n"); exit (1); } lenG = params->l; g = init_list2 (lenG, (unsigned int) abs (modulus->bits)); lenR = nr; R = init_list2 (lenR, (unsigned int) abs (modulus->bits)); tmplen = 3UL * params->l + list_mul_mem (params->l / 2); outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen); if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen) { tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12; /* FIXME: It appears TMulGen_space() returns a too small value! */ outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n", tmplen); } #ifdef SHOW_TMP_USAGE tmp = init_list (tmplen); #else tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits)); #endif mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */ outputf (OUTPUT_TRACE, "N = %Zd; X = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, mt); /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2k_1}) */ outputf (OUTPUT_VERBOSE, "Computing F from factored S_1"); timestart = cputime (); /* First compute X + 1/X */ mpres_invert (mr, X, modulus); mpres_add (mr, mr, X, modulus); i = poly_from_sets_V (F, mr, S_1, tmp, tmplen, modulus, NULL, NULL); ASSERT_ALWAYS(2 * i == params->s_1); ASSERT(mpz_cmp_ui (F[i], 1UL) == 0); free (S_1); S_1 = NULL; outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart); if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < params->s_1 / 2 + 1; i++) outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]); outputf (OUTPUT_TRACE, "f(x) = f_0"); for (i = 1; i < params->s_1 / 2 + 1; i++) outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i); outputf (OUTPUT_TRACE, "/* PARI */ \n"); } mpz_set_ui 
(mt, params->P); mpres_pow (mr, X, mt, modulus); /* mr = X^P */ pm1_sequence_h (F, NULL, F, mr, params->s_1 / 2 + 1, modulus, NULL); /* Make a symmetric copy of F in h. It will have length s_1 + 1 = 2*lenF - 1 */ /* I.e. with F = [3, 2, 1], s_1 = 4, we want h = [1, 2, 3, 2, 1] */ for (i = 0; i < params->s_1 / 2 + 1; i++) *(h[i]) = *(F[params->s_1 / 2 - i]); /* Clone the mpz_t. */ for (i = 0; i < params->s_1 / 2; i++) *(h[i + params->s_1 / 2 + 1]) = *(F[i + 1]); if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < params->s_1 + 1; i++) outputf (OUTPUT_VERBOSE, "h_%lu = %Zd; /* PARI */\n", i, h[i]); outputf (OUTPUT_VERBOSE, "h(x) = h_0"); for (i = 1; i < params->s_1 + 1; i++) outputf (OUTPUT_VERBOSE, " + h_%lu * x^%lu", i, i); outputf (OUTPUT_VERBOSE, " /* PARI */\n"); } for (l = 0; l < params->s_2; l++) { const unsigned long M = params->l - 1L - params->s_1 / 2L; outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", l + 1, params->s_2); pm1_sequence_g (g, NULL, X, params->P, M, params->l, params->m_1, S_2->elem[l], modulus, NULL); /* Do the convolution */ /* Use the transposed "Middle Product" algorithm */ /* TMulGen reverses the first input sequence, but that doesn't matter since h is symmetric. */ outputf (OUTPUT_VERBOSE, "TMulGen of g and h"); timestart = cputime (); ASSERT(tmplen >= TMulGen_space (nr - 1, params->l - 1, params->s_1)); /* Computes rev(h)*g, stores coefficients of x^(s_1) to x^(s_1+nr-1) = x^(len-1) */ if (TMulGen (R, nr - 1, h, params->s_1, g, params->l - 1, tmp, modulus->orig_modulus) < 0) { outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out " "of memory)\n"); youpi = ECM_ERROR; break; } list_mod (R, R, nr, modulus->orig_modulus); outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart); #if 0 && defined(WANT_ASSERT) /* See if R[i] is correct, with a test that works even if i0 != 0 */ /* More expensive self-test */ /* alpha = beta*(i0 + l*nr) */ /* This code is old and probably does not work. 
*/ outputf (OUTPUT_VERBOSE, "Verifying all results (slow)"); for (i = 0; i < nr; i++) { mpz_set_ui (mt, nr * l); mpz_add (mt, mt, root_params->i0); mpz_add_ui (mt, mt, i); mpz_mul_ui (mt, mt, beta); mpres_get_z (tmp[0], X, modulus); mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* Hence, tmp[0] = X^(alpha + i * beta) */ list_eval_poly (tmp[1], F, tmp[0], dF, 1, modulus->orig_modulus, tmp + 2); mpz_set_ui (mt, i); mpz_mul_ui (mt, mt, i); mpz_mul_ui (mt, mt, beta / 2); /* h(i) = beta*i^2/2 */ mpres_get_z (tmp[0], X, modulus); mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* X^h(1) */ mpz_mul (tmp[0], tmp[0], R[i]); mpz_mod (tmp[0], tmp[0], modulus->orig_modulus); if (mpz_cmp (tmp[0], tmp[1]) != 0) { outputf (OUTPUT_ERROR, "Result in R[%ld] incorrect.\n", i); outputf (OUTPUT_ERROR, "R[%ld] = %Zd\n", i, R[i]); abort (); } } outputf (OUTPUT_VERBOSE, " - everything's correct! :-D\n"); #endif if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < nr; i++) outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[i]); } outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)"); timestart = cputime (); { mpres_t tmpres, tmpprod; mpres_init (tmpres, modulus); mpres_init (tmpprod, modulus); mpres_set_z_for_gcd (tmpprod, R[0], modulus); for (i = 1; i < nr; i++) { mpres_set_z_for_gcd (tmpres, R[i], modulus); mpres_mul (tmpprod, tmpprod, tmpres, modulus); } mpres_get_z (tmp[1], tmpprod, modulus); /* For printing */ mpres_gcd (tmp[0], tmpprod, modulus); mpres_clear (tmpprod, modulus); mpres_clear (tmpres, modulus); } outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart); outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd (times some " "power of 2 if REDC was used! 
Try -mpzmod)\n", tmp[1]); if (mpz_cmp_ui (tmp[0], 1UL) > 0) { mpz_set (f, tmp[0]); youpi = ECM_FACTOR_FOUND_STEP2; break; } } #ifdef SHOW_TMP_USAGE for (i = tmplen - 1; i > 0; i--) if (tmp[i]->_mp_alloc > 1) break; outputf (OUTPUT_DEVVERBOSE, "Highest used temp element is tmp[%lu]\n", i); #endif free (S_2); free (h); clear_list (F, lenF); clear_list (g, lenG); clear_list (R, lenR); clear_list (tmp, tmplen); mpz_clear (mt); mpres_clear (mr, modulus); outputf (OUTPUT_NORMAL, "Step 2"); /* In normal output mode, print only cpu time as we always have. In verbose mode, print real time as well if we used multi-threading */ if (test_verbose (OUTPUT_VERBOSE)) print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart); else print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L); return youpi; } int pm1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus, const faststage2_param_t *params) { unsigned long nr; unsigned long l, lenF; sets_long_t *S_1; /* This is stored as a set of sets (arithmetic progressions of prime length */ set_long_t *S_2; /* This is stored as a regular set */ listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has degree s_1. It is symmetric, so has only s_1 / 2 + 1 distinct coefficients. The sequence h_j will be stored in the same memory and won't be a monic polynomial, so the leading 1 monomial of F will be stored explicitly. Hence we need s_1 / 2 + 1 entries. 
*/ mpzspm_t ntt_context; mpzspv_t g_ntt, h_ntt; mpz_t mt; /* All-purpose temp mpz_t */ mpz_t product; /* Product of each multi-point evaluation */ mpz_t *product_ptr = NULL; mpres_t tmpres; /* All-purpose temp mpres_t */ int youpi = ECM_NO_FACTOR_FOUND; long timetotalstart, realtotalstart, timestart, realstart; timetotalstart = cputime (); realtotalstart = realtime (); ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2); ASSERT_ALWAYS (params->s_1 < params->l); nr = params->l - params->s_1; /* Number of points we evaluate */ /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution with g. We need NTT of transform length l. We do it here at the start of stage 2 so that in case of a "not enough primes" condition, we don't have to wait until after F is built to get the error. */ ntt_context = mpzspm_init (params->l, modulus->orig_modulus); if (ntt_context == NULL) { outputf (OUTPUT_ERROR, "Could not initialise ntt_context, " "presumably out of memory\n"); return ECM_ERROR; } print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ", ntt_context); if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR) return ECM_ERROR; /* Allocate all the memory we'll need for building f */ mpz_init (mt); mpres_init (tmpres, modulus); lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores the leading 1 monomial for each factor */ F = init_list2 (lenF, (unsigned int) abs (modulus->bits)); mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */ outputf (OUTPUT_TRACE, "N = %Zd; X = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, mt); #if 0 && defined (WANT_ASSERT) /* For this self test run with a large enough B2 so that enough memory is allocated for tmp and F_ntt, otherwise it segfaults. 
*/ { int testlen = 255; int i, j; /* A test of ntt_sqr_reciprocal() */ for (j = 1; j <= testlen; j++) { outputf (OUTPUT_VERBOSE, "Testing ntt_sqr_reciprocal() for input degree %d\n", j - 1); for (i = 0; i < j; i++) mpz_set_ui (tmp[i], 1UL); ntt_sqr_reciprocal (tmp, tmp, F_ntt, (spv_size_t) j, ntt_context_F); for (i = 0; i < 2 * j - 1; i++) { ASSERT (mpz_cmp_ui (tmp[i], 2 * j - 1 - i) == 0); } } outputf (OUTPUT_VERBOSE, "Test of ntt_sqr_reciprocal() for input degree 2 ... %d passed\n", testlen - 1); } #endif /* First compute X + 1/X */ mpres_invert (tmpres, X, modulus); mpres_add (tmpres, tmpres, X, modulus); if (build_F_ntt (F, tmpres, S_1, params, modulus) == ECM_ERROR) { free (S_1); free (S_2); mpz_clear (mt); mpres_clear (tmpres, modulus); mpzspm_clear (ntt_context); clear_list (F, lenF); return ECM_ERROR; } free (S_1); S_1 = NULL; h_ntt = mpzspv_init (params->l / 2 + 1, ntt_context); mpz_set_ui (mt, params->P); mpres_pow (tmpres, X, mt, modulus); /* tmpres = X^P */ pm1_sequence_h (NULL, h_ntt, F, tmpres, params->s_1 / 2 + 1, modulus, ntt_context); clear_list (F, lenF); g_ntt = mpzspv_init (params->l, ntt_context); /* Compute the DCT-I of h */ outputf (OUTPUT_VERBOSE, "Computing DCT-I of h"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_to_dct1 (h_ntt, h_ntt, params->s_1 / 2 + 1, params->l / 2 + 1, g_ntt, ntt_context); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); if (test_verbose (OUTPUT_RESVERBOSE)) { mpz_init (product); product_ptr = &product; } for (l = 0; l < params->s_2; l++) { const unsigned long M = params->l - 1L - params->s_1 / 2L; outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", l + 1, params->s_2); /* Compute the coefficients of the polynomial g(x) */ pm1_sequence_g (NULL, g_ntt, X, params->P, M, params->l, params->m_1, S_2->elem[l], modulus, ntt_context); /* Do the convolution */ outputf (OUTPUT_VERBOSE, "Computing 
g*h"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_mul_by_dct (g_ntt, h_ntt, params->l, ntt_context, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); /* Compute GCD of N and coefficients of product polynomial */ ntt_gcd (mt, product_ptr, g_ntt, params->s_1 / 2, NULL, nr, ntt_context, modulus); outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd (times some " "power of 2 if REDC was used! Try -mpzmod)\n", product); /* If we found a factor, stop */ if (mpz_cmp_ui (mt, 1UL) > 0) { mpz_set (f, mt); youpi = ECM_FACTOR_FOUND_STEP2; break; } } if (test_verbose (OUTPUT_RESVERBOSE)) { product_ptr = NULL; mpz_clear (product); } mpzspv_clear (g_ntt, ntt_context); mpzspv_clear (h_ntt, ntt_context); mpzspm_clear (ntt_context); mpres_clear (tmpres, modulus); mpz_clear (mt); free (S_2); outputf (OUTPUT_NORMAL, "Step 2"); /* In normal output mode, print only cpu time as we always have. In verbose mode, print real time as well if we used multi-threading */ if (test_verbose (OUTPUT_VERBOSE)) print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart); else print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L); return youpi; } static void gfp_ext_print (const mpres_t r_x, const mpres_t r_y, mpmod_t modulus, const int verbose) { mpz_t t1, t2; if (!test_verbose (verbose)) return; mpz_init (t1); mpz_init (t2); mpres_get_z (t1, r_x, modulus); mpres_get_z (t2, r_y, modulus); outputf (verbose, "Mod(%Zd, N) + Mod(%Zd, N) * w", t1, t2); mpz_clear (t1); mpz_clear (t2); } /* Multiplies (a_0 + a_1*sqrt(Delta)) * (b_0 + b_1*sqrt(Delta)) using four multiplications. Result goes in (r_0 + r_1*sqrt(Delta)). a_0, b_0, r_0 as well as a_1, b_1, r_1 may overlap arbitrarily. t[0], t[1], t[2] and Delta must not overlap with anything. */ /* FIXME: is there a faster multiplication routine if both inputs have norm 1? 
*/
/* gfp_ext_mul: product of two elements of the quadratic extension.
     r_0, r_1      receive the two coordinates of the result
     a_0, a_1      coordinates of the first factor
     b_0, b_1      coordinates of the second factor
     Delta         the value whose square root generates the extension
     tmplen, tmp   temporaries; tmp[0] and tmp[1] are used
   The statement order below matters: each input coordinate is read for
   the last time before the output coordinate that may alias it is
   written. */

static void
gfp_ext_mul (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1,
             const mpres_t b_0, const mpres_t b_1, const mpres_t Delta,
             mpmod_t modulus, ATTRIBUTE_UNUSED const unsigned long tmplen,
             mpres_t *tmp)
{
  ASSERT (tmplen >= 2);
  /* Trace output, disabled by the constant 0 in the condition */
  if (0 && test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Delta, modulus);
      outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ w = quadgen (4*%Zd); "
               "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
      mpz_clear (t);
      outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ (");
      gfp_ext_print (a_0, a_1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, ") * (");
      gfp_ext_print (b_0, b_1, modulus, OUTPUT_TRACE);
    }

  mpres_add (tmp[0], a_0, a_1, modulus);
  mpres_add (tmp[1], b_0, b_1, modulus);
  mpres_mul (tmp[1], tmp[0], tmp[1], modulus); /* tmp[1] = (a_0+a_1)*(b_0+b_1) =
                                            a_0*b_0 + a_0*b_1 + a_1*b_0 +
                                            a_1*b_1 */

  mpres_mul (r_0, a_0, b_0, modulus);    /* r_0 = a_0*b_0. We don't need a_0
                                            or b_0 any more now */
  mpres_sub (tmp[1], tmp[1], r_0, modulus);  /* tmp[1] = a_0*b_1 + a_1*b_0 +
                                                a_1*b_1 */

  mpres_mul (tmp[0], a_1, b_1, modulus);   /* tmp[0] = a_1*b_1. We don't need
                                              a_1 or b_1 any more now */
  mpres_sub (r_1, tmp[1], tmp[0], modulus);  /* r_1 == a_0*b_1 + a_1*b_0 */

  mpres_mul (tmp[0], tmp[0], Delta, modulus); /* tmp[0] = a_1*b_1*Delta */
  mpres_add (r_0, r_0, tmp[0], modulus);   /* r_0 = a_0*b_0 + a_1*b_1*Delta */

  /* Trace output, disabled by the constant 0 in the condition */
  if (0 && test_verbose (OUTPUT_TRACE))
    {
      outputf (OUTPUT_TRACE, ") == ");
      gfp_ext_print (r_0, r_1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, " /* PARI C */\n");
    }
}


/* Computes (a_0 + a_1 * sqrt(Delta))^2, where the norm
   (a_0^2 - a_1^2*Delta) is assumed to be equal to 1. Hence
   (a_0 + a_1 * sqrt(Delta))^2 = a_0^2 + 2*a_0*a_1*sqrt(Delta) + a_1^2*Delta
   and a_0^2 + a_1^2*Delta = a_0^2 + a_1^2*Delta + norm - 1 = 2*a_0^2 - 1.
a_0 and r_0, as well as a_1 and r_1 may overlap */ static void gfp_ext_sqr_norm1 (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1, mpmod_t modulus) { ASSERT (a_0 != r_1); /* a_0 is read after r_1 is written */ if (pari) gmp_printf ("/* gfp_ext_sqr_norm1 */ (%Zd + %Zd * w)^2 %% N == ", a_0, a_1); mpres_mul (r_1, a_0, a_1, modulus); mpres_add (r_1, r_1, r_1, modulus); /* r_1 = 2*a_0*a_1 */ mpres_sqr (r_0, a_0, modulus); mpres_add (r_0, r_0, r_0, modulus); mpres_sub_ui (r_0, r_0, 1UL, modulus); /* r_0 = 2*a_0^2 - 1 */ if (pari) gmp_printf ("(%Zd + %Zd * w) %% N /* PARI C */\n", r_0, r_1); } /* Raise (a0 + a1*sqrt(Delta)) to the power e which is a signed long int. (a0 + a1*sqrt(Delta)) is assumed to have norm 1, i.e. a0^2 - a1^2*Delta == 1. The result is (r0 * r1*sqrt(Delta)). a0, a1, r0 and r1 must not overlap */ static void gfp_ext_pow_norm1_sl (mpres_t r0, mpres_t r1, const mpres_t a0, const mpres_t a1, const long e, const mpres_t Delta, mpmod_t modulus, unsigned long tmplen, mpres_t *tmp) { const unsigned long abs_e = labs (e); unsigned long mask = ~0UL - (~0UL >> 1); ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1); if (e == 0) { mpres_set_ui (r0, 1UL, modulus); mpres_set_ui (r1, 0UL, modulus); return; } /* If e < 0, we want 1/(a0 + a1*sqrt(Delta)). By extending with a0 - a1*sqrt(Delta), we get (a0 - a1*sqrt(Delta)) / (a0^2 - a1^2 * Delta), but that denomiator is the norm which is known to be 1, so the result is a0 - a1*sqrt(Delta). 
*/ while ((abs_e & mask) == 0UL) mask >>= 1; mpres_set (r0, a0, modulus); mpres_set (r1, a1, modulus); while (mask > 1UL) { gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus); mask >>= 1; if (abs_e & mask) gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp); } if (e < 0) mpres_neg (r1, r1, modulus); if (0 && test_verbose (OUTPUT_TRACE)) { mpz_t t; mpz_init (t); mpres_get_z (t, Delta, modulus); outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ w = quadgen (4*%Zd); " "N = %Zd; /* PARI */\n", t, modulus->orig_modulus); mpz_clear (t); outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ ("); gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE); outputf (OUTPUT_TRACE, ")^(%ld) == ", e); gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE); outputf (OUTPUT_TRACE, " /* PARI C */\n"); } } /* Same, but taking an mpz_t argument for the exponent */ static void gfp_ext_pow_norm1 (mpres_t r0, mpres_t r1, const mpres_t a0, const mpres_t a1, mpz_t e, const mpres_t Delta, mpmod_t modulus, unsigned long tmplen, mpres_t *tmp) { mpz_t abs_e; unsigned long idx; ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1); if (mpz_sgn (e) == 0) { mpres_set_ui (r0, 1UL, modulus); mpres_set_ui (r1, 0UL, modulus); return; } mpz_init (abs_e); mpz_abs (abs_e, e); idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus mpz_tstbit (abs_e, idx) == 1 */ ASSERT (mpz_tstbit (abs_e, idx) == 1); mpres_set (r0, a0, modulus); mpres_set (r1, a1, modulus); while (idx > 0UL) { gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus); idx--; if (mpz_tstbit (abs_e, idx)) gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp); } if (mpz_sgn (e) < 0) mpres_neg (r1, r1, modulus); mpz_clear (abs_e); if (test_verbose (OUTPUT_TRACE)) { mpz_t t; mpz_init (t); mpres_get_z (t, Delta, modulus); outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ w = quadgen (4*%Zd); " "N = %Zd; /* PARI */\n", t, modulus->orig_modulus); mpz_clear (t); outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ ("); gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE); outputf 
(OUTPUT_TRACE, ")^(%Zd) == ", e);
      gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, " /* PARI C */\n");
    }
}


/* Compute r[i] = a^((k+i)^2) for i = 0, 1, ..., l-1, where "a" is an
   element of norm 1 in the quadratic extension ring.

   r_x, r_y         output arrays for the two coordinates of r[i]
   a_x, a_y         coordinates of a
   k, l             first exponent base and number of values
   Delta            passed through to the extension arithmetic
   origtmplen,
   origtmp          temporary storage. Entries 0..6 are carved up below
                    (r2_x, r2_y and v get two entries each, V2 gets one);
                    the entries from index 7 on are handed to the helper
                    functions as their temporaries.

   Returns immediately if l == 0.
   NOTE(review): the accesses to r_x[1] and r_y[1] after the "if (l > 1)"
   block are not guarded by l > 1, so this presumably needs output arrays
   with at least two entries even for l == 1 -- confirm against callers. */

ATTRIBUTE_UNUSED
static void
gfp_ext_rn2 (mpres_t *r_x, mpres_t *r_y, const mpres_t a_x, const mpres_t a_y,
             const long k, const unsigned long l, const mpres_t Delta,
             mpmod_t modulus, const unsigned long origtmplen,
             mpres_t *origtmp)
{
  mpres_t *r2_x = origtmp, *r2_y = origtmp + 2, *v = origtmp + 4,
    *V2 = origtmp + 6;
  const unsigned long newtmplen = origtmplen - 7;
  mpres_t *newtmp = origtmp + 7;
  unsigned long i;

  if (l == 0UL)
    return;

  ASSERT (origtmplen >= 8UL);

  /* The format below has two conversions; the third argument is unused */
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ ; a = %Zd + %Zd * w; /* PARI */\n",
                a_x, a_y, modulus->orig_modulus);

  /* Compute r[0] = a^(k^2). We do it by two exponentiations by k and use
     v[0] and v[1] as temp storage */
  gfp_ext_pow_norm1_sl (v[0], v[1], a_x, a_y, k, Delta, modulus, newtmplen,
                        newtmp);
  gfp_ext_pow_norm1_sl (r_x[0], r_y[0], v[0], v[1], k, Delta, modulus,
                        newtmplen, newtmp);
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
                "/* PARI C */\n", k, r_x[0], r_y[0]);

  /* Compute r[1] = a^((k+1)^2) = a^(k^2 + 2k + 1)*/
  if (l > 1)
    {
      /* v[0] + v[1]*sqrt(Delta) still contains a^k */
      gfp_ext_sqr_norm1 (r_x[1], r_y[1], v[0], v[1], modulus);
      /* Now r[1] = a^(2k) */
      gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], r_x[0], r_y[0], Delta,
                   modulus, newtmplen, newtmp);
      /* Now r[1] = a^(k^2 + 2k) */
      gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], a_x, a_y, Delta, modulus,
                   newtmplen, newtmp);
      /* Now r[1] = a^(k^2 + 2k + 1) = a^((k+1)^2) */
    }
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
                "/* PARI C */\n", k + 1, r_x[1], r_y[1]);

  /* Compute r2[0] = a^(k^2+2) = a^(k^2) * a^2 */
  gfp_ext_sqr_norm1 (v[0], v[1], a_x, a_y, modulus); /* v[0,1] = a^2 */
  gfp_ext_mul (r2_x[0], r2_y[0], r_x[0], r_y[0], v[0], v[1], Delta, modulus,
               newtmplen, newtmp);
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
                "/* PARI C */\n", k, r2_x[0], r2_y[0]);
  /* Compute r2[1] = a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
  gfp_ext_mul (r2_x[1], r2_y[1], r_x[1], r_y[1], v[0], v[1], Delta, modulus,
               newtmplen, newtmp);
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
                "/* PARI C */\n", k + 1, r2_x[1], r2_y[1]);

  /* Compute V_2(a + 1/a). Since 1/a = a_x - a_y*sqrt(Delta) for an element
     of norm 1, we have a+1/a = 2*a_x.
     V_2(x) = x^2 - 2, so we want 4*a_x^2 - 2. */
  mpres_add (*V2, a_x, a_x, modulus); /* V2 = a + 1/a  = 2*a_x*/
  V (v[0], *V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (a + 1/a) */
  V (v[1], *V2, 2 * k + 3, modulus);  /* v[1] = V_{2k+3} (a + 1/a) */
  mpres_sqr (*V2, *V2, modulus);      /* V2 = 4*a_x^2 */
  mpres_sub_ui (*V2, *V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
  if (pari)
    {
      gmp_printf ("/* In gfp_ext_rn2 */ ((a + 1/a)^2 - 2) %% N == "
                  "%Zd %% N /* PARI C */\n", *V2);
      gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
                  "/* PARI C */\n", 2 * k + 1, v[0]);
      gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
                  "/* PARI C */\n", 2 * k + 3, v[1]);
    }

  /* Compute the remaining a^((k+i)^2) values according to Peter's
     recurrence */

  for (i = 2; i < l; i++)
    {
      /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i taken
         modulo 2 */
      mpres_mul (r_x[i], r2_x[1 - i % 2], v[i % 2], modulus);
      mpres_sub (r_x[i], r_x[i],          r2_x[i % 2], modulus);
      mpres_mul (r_y[i], r2_y[1 - i % 2], v[i % 2], modulus);
      mpres_sub (r_y[i], r_y[i],          r2_y[i % 2], modulus);

      /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
      mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
      mpres_sub (r2_x[i % 2], r2_x[i % 2],     r_x[i - 2], modulus);
      mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
      mpres_sub (r2_y[i % 2], r2_y[i % 2],     r_y[i - 2], modulus);

      /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
      mpres_mul (newtmp[0], v[1 - i % 2], *V2, modulus);
      mpres_sub (v[i % 2], newtmp[0], v[i % 2],
modulus);
      if (pari)
        gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
                    "/* PARI C */\n", 2 * (k + i) + 1, v[i % 2]);
    }
}


/* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
   x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P.

   Each coordinate is written to the mpz_t list (g_x, g_y) and/or to the
   NTT vector (g_x_ntt, g_y_ntt), whichever of them is not NULL. In the
   recurrence loop a coordinate is computed only if at least one of its
   two outputs is non-NULL.

   When compiled with OpenMP and l_param > 100, the index range is split
   into one chunk per thread; each thread works on its own copy of the
   modulus and of all residues, starting at index "offset" with M reduced
   by offset.

   NOTE(review): g[offset] and g[offset + 1] are always written, without a
   check that the (per-thread) length l is at least 2 -- confirm that
   callers and the chunking guarantee this. */

static void
pp1_sequence_g (listz_t g_x, listz_t g_y, mpzspv_t g_x_ntt, mpzspv_t g_y_ntt,
                const mpres_t b1_x, const mpres_t b1_y,
                const unsigned long P, const mpres_t Delta,
                const long M_param, const unsigned long l_param,
                const mpz_t m_1, const long k_2,
                const mpmod_t modulus_param, const mpzspm_t ntt_context)
{
  const unsigned long tmplen = 3;
  const int want_x = (g_x != NULL || g_x_ntt != NULL);
  const int want_y = (g_y != NULL || g_y_ntt != NULL);
  mpres_t r_x, r_y, x0_x, x0_y, v2,
    r1_x[2], r1_y[2], r2_x[2], r2_y[2],
    v[2], tmp[3];
  mpz_t mt;
  mpmod_t modulus; /* Thread-local copy of modulus_param */
  unsigned long i, l = l_param, offset = 0;
  long M = M_param;
  long timestart, realstart;
  int want_output = 1;

  outputf (OUTPUT_VERBOSE, "Computing %s%s%s",
           (want_x) ? "g_x" : "",
           (want_x && want_y) ? " and " : "",
           (want_y) ? "g_y" : "");
  timestart = cputime ();
  realstart = realtime ();

#ifdef _OPENMP
#pragma omp parallel if (l > 100) private(r_x, r_y, x0_x, x0_y, v2, r1_x, r1_y, r2_x, r2_y, v, tmp, mt, modulus, i, l, offset, M, want_output)
  {
    /* When multi-threading, we adjust the parameters for each thread */

    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();

    l = (l_param - 1) / nr_chunks + 1;
    offset = thread_nr * l;
    ASSERT_ALWAYS (l_param >= offset);
    l = MIN(l, l_param - offset);
    M = M_param - (long) offset;

    /* Only thread 0 produces verbose and trace output */
    want_output = (omp_get_thread_num() == 0);
    if (want_output)
      outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
#endif
    mpmod_init_set (modulus, modulus_param);
    mpres_init (r_x, modulus);
    mpres_init (r_y, modulus);
    mpres_init (x0_x, modulus);
    mpres_init (x0_y, modulus);
    mpres_init (v2, modulus);
    for (i = 0; i < 2UL; i++)
      {
        mpres_init (r1_x[i], modulus);
        mpres_init (r1_y[i], modulus);
        mpres_init (r2_x[i], modulus);
        mpres_init (r2_y[i], modulus);
        mpres_init (v[i], modulus);
      }
    for (i = 0; i < tmplen; i++)
      mpres_init (tmp[i], modulus);
    mpz_init (mt);

    if (want_output && test_verbose (OUTPUT_TRACE))
      {
        mpres_get_z (mt, Delta, modulus);
        outputf (OUTPUT_TRACE,
                 "\n/* pp1_sequence_g */ w = quadgen (4*%Zd); P = %lu; "
                 "M = %ld; k_2 = %ld; m_1 = %Zd; N = %Zd; /* PARI */\n",
                 mt, P, M, k_2, m_1, modulus->orig_modulus);

        outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ b_1 = ");
        gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
        outputf (OUTPUT_TRACE, "; /* PARI */\n");
        outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r = b_1^P; /* PARI */\n");
        outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
                 "x_0 = b_1^(2*k_2 + (2*m_1 + 1) * P); /* PARI */\n");
        outputf (OUTPUT_TRACE,
                 "/* pp1_sequence_g */ addrec(x) = x + 1/x; /* PARI */\n");
      }

    /* Compute r */
    gfp_ext_pow_norm1_sl (r_x, r_y, b1_x, b1_y, P, Delta, modulus,
                          tmplen, tmp);
    if (want_output && test_verbose (OUTPUT_TRACE))
      {
        outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r == ");
        gfp_ext_print (r_x, r_y, modulus, OUTPUT_TRACE);
        outputf (OUTPUT_TRACE, " /* PARI C */\n");
      }

    /* Compute x0 = x_0 */
    mpz_mul_2exp (mt, m_1, 1UL);
    mpz_add_ui (mt, mt, 1UL);
    mpz_mul_ui (mt, mt, P);
    mpz_add_si (mt, mt, k_2);
    mpz_add_si (mt, mt, k_2); /* mt = 2*k_2 + (2*m_1 + 1) * P */
    gfp_ext_pow_norm1 (x0_x, x0_y, b1_x, b1_y, mt, Delta, modulus,
                       tmplen, tmp);
    if (want_output && test_verbose (OUTPUT_TRACE))
      {
        outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ x_0 == ");
        gfp_ext_print (x0_x, x0_y, modulus, OUTPUT_TRACE);
        outputf (OUTPUT_TRACE, " /* PARI C */\n");
      }

    /* Compute g[0] = r1[0] = x0^M * r^(M^2) = (x0 * r^M)^M.
       We use v[0,1] as temporary storage */
    gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M, Delta, modulus,
                          tmplen, tmp); /* v[0,1] = r^M */
    gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
                 tmplen, tmp); /* v[0,1] = r^M * x_0 */
    gfp_ext_pow_norm1_sl (r1_x[0], r1_y[0], v[0], v[1], M, Delta, modulus,
                          tmplen, tmp); /* r1[0] = (r^M * x_0)^M */
    if (g_x != NULL)
      mpres_get_z (g_x[offset], r1_x[0], modulus);
    if (g_y != NULL)
      mpres_get_z (g_y[offset], r1_y[0], modulus);
    if (g_x_ntt != NULL)
      {
        mpres_get_z (mt, r1_x[0], modulus);
        mpzspv_from_mpzv (g_x_ntt, offset, &mt, 1UL, ntt_context);
      }
    if (g_y_ntt != NULL)
      {
        mpres_get_z (mt, r1_y[0], modulus);
        mpzspv_from_mpzv (g_y_ntt, offset, &mt, 1UL, ntt_context);
      }

    /* Compute g[1] = r1[1] = x0^(M-1) * r^((M-1)^2) = (x0 * r^(M-1))^(M-1).
       We use v[0,1] as temporary storage. FIXME: simplify, reusing g_0 */
    gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M - 1, Delta, modulus,
                          tmplen, tmp);
    gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
                 tmplen, tmp);
    gfp_ext_pow_norm1_sl (r1_x[1], r1_y[1], v[0], v[1], M - 1, Delta,
                          modulus, tmplen, tmp);
    if (g_x != NULL)
      mpres_get_z (g_x[offset + 1], r1_x[1], modulus);
    if (g_y != NULL)
      mpres_get_z (g_y[offset + 1], r1_y[1], modulus);
    if (g_x_ntt != NULL)
      {
        mpres_get_z (mt, r1_x[1], modulus);
        mpzspv_from_mpzv (g_x_ntt, offset + 1, &mt, 1UL, ntt_context);
      }
    if (g_y_ntt != NULL)
      {
        mpres_get_z (mt, r1_y[1], modulus);
        mpzspv_from_mpzv (g_y_ntt, offset + 1, &mt, 1UL, ntt_context);
      }

    /* x0 := $x_0 * r^{2M - 3}$ */
    /* We don't need x0 after this so we overwrite it. We use v[0,1] as
       temp storage for $r^{2M - 3}$. */
    gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, 2UL*M - 3UL, Delta, modulus,
                          tmplen, tmp);
    gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, v[0], v[1], Delta, modulus,
                 tmplen, tmp);

    /* Compute r2[0] = r1[0] * r^2 and r2[1] = r1[1] * r^2. */
    /* We only need $r^2$ from here on, so we set r = $r^2$ */
    gfp_ext_sqr_norm1 (r_x, r_y, r_x, r_y, modulus);
    gfp_ext_mul (r2_x[0], r2_y[0], r1_x[0], r1_y[0], r_x, r_y, Delta,
                 modulus, tmplen, tmp);
    gfp_ext_mul (r2_x[1], r2_y[1], r1_x[1], r1_y[1], r_x, r_y, Delta,
                 modulus, tmplen, tmp);

    /* v[1] := $x_0 * r^{2*M - 3} + 1/(x_0 * r^{2M - 3}) */
    mpres_add (v[1], x0_x, x0_x, modulus);
    /* x0 := x0 * r = $x_0 * r^{2M - 1}$ */
    gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, r_x, r_y, Delta, modulus,
                 tmplen, tmp);
    /* v[0] := $x_0 * r^{2M - 1} + 1/(x_0 * r^{2M - 1}) */
    mpres_add (v[0], x0_x, x0_x, modulus);

    /* v2 = V_2 (r + 1/r) = r^2 + 1/r^2 */
    mpres_add (v2, r_x, r_x, modulus);

    /* We don't need the contents of r any more and use it as a temp var */

    for (i = 2; i < l; i++)
      {
        if (want_x)
          {
            /* r1[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
               taken modulo 2. We store the new r1_x[i] in r_x for now */
            mpres_mul (r_x, r2_x[1 - i % 2], v[i % 2], modulus);
            mpres_sub (r_x, r_x,             r2_x[i % 2], modulus);
            /* r2[i] = r2[i-1] * v[i-1] - r1[i-2] */
            mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
            mpres_sub (r2_x[i % 2], r2_x[i % 2],     r1_x[i % 2], modulus);
            mpres_set (r1_x[i % 2], r_x, modulus); /* FIXME, avoid this copy */
            if (g_x != NULL)
              mpres_get_z (g_x[offset + i], r_x, modulus); /* FIXME, avoid these REDC */
            if (g_x_ntt != NULL)
              {
                mpres_get_z (mt, r_x, modulus);
                mpzspv_from_mpzv (g_x_ntt, offset + i, &mt, 1UL, ntt_context);
              }
          }

        if (want_y)
          {
            /* Same for y coordinate */
            mpres_mul (r_y, r2_y[1 - i % 2], v[i % 2], modulus);
            mpres_sub (r_y, r_y,             r2_y[i % 2], modulus);
            mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
            mpres_sub (r2_y[i % 2], r2_y[i % 2],     r1_y[i % 2], modulus);
            mpres_set (r1_y[i % 2], r_y, modulus);
            if (g_y != NULL)
              mpres_get_z (g_y[offset + i], r_y, modulus); /* Keep r1, r2 in mpz_t ? */
            if (g_y_ntt != NULL)
              {
                mpres_get_z (mt, r_y, modulus);
                mpzspv_from_mpzv (g_y_ntt, offset + i, &mt, 1UL, ntt_context);
              }
          }

        /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
        /* r_x is free again here and serves as the temporary */
        mpres_mul (r_x, v[1 - i % 2], v2, modulus);
        mpres_sub (v[i % 2], r_x, v[i % 2], modulus);
        if (want_output && test_verbose (OUTPUT_TRACE))
          {
            mpz_t t;
            mpz_init (t);
            mpres_get_z (t, v[i % 2], modulus);
            outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
                     "addrec(x_0 * r^(2*(M-%lu) - 1)) == %Zd /* PARI C */\n",
                     i, t);
            mpz_clear (t);
          }
      }

    mpres_clear (r_x, modulus);
    mpres_clear (r_y, modulus);
    mpres_clear (x0_x, modulus);
    mpres_clear (x0_y, modulus);
    mpres_clear (v2, modulus);
    for (i = 0; i < 2; i++)
      {
        mpres_clear (r1_x[i], modulus);
        mpres_clear (r1_y[i], modulus);
        mpres_clear (r2_x[i], modulus);
        mpres_clear (r2_y[i], modulus);
        mpres_clear (v[i], modulus);
      }
    for (i = 0; i < tmplen; i++)
      mpres_clear (tmp[i], modulus);
    mpz_clear (mt);
    mpmod_clear (modulus);
#ifdef _OPENMP
  }
#endif

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  /* The full sequence is only traced from the mpz_t lists */
  if (g_x != NULL && g_y != NULL && test_verbose(OUTPUT_TRACE))
    {
      for (i = 0; i < l_param; i++)
        {
          outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu = "
                   "x_0^(M-%lu) * r^((M-%lu)^2); /* PARI */", i, i, i);
          outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu == "
                   "%Zd + %Zd*w /* PARI C */\n", i, g_x[i], g_y[i]);
        }
    }
}


/* Compute r[i] = b1^(-P*(k+i)^2) * f_i for i = 0, 1, ..., l-1, where "b1" is
   an element of norm 1 in the quadratic extension ring */

static void
pp1_sequence_h (listz_t h_x, listz_t h_y, mpzspv_t h_x_ntt, mpzspv_t h_y_ntt,
                const listz_t f, const mpres_t b1_x, const mpres_t b1_y,
                const long k_param, const unsigned long l_param,
                const unsigned long P, const mpres_t Delta,
                mpmod_t modulus_param, const mpzspm_t ntt_context)
{
  unsigned long i;
  long timestart, realstart;

  if (l_param == 0UL)
    return;

  ASSERT (f != h_x);
  ASSERT (f != h_y);

  outputf (OUTPUT_VERBOSE, "Computing h_x and h_y");
  timestart = cputime ();
  realstart = realtime ();

  if (test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Delta, modulus_param);
      outputf (OUTPUT_TRACE, "\n/* pp1_sequence_h */ N = %Zd; "
               "Delta = %Zd; w = quadgen (4*Delta); k = %ld; P = %lu; "
               "/* PARI */\n", modulus_param->orig_modulus, t, k_param, P);
      outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ b_1 = ");
      gfp_ext_print (b1_x, b1_y, modulus_param, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, "; r = b_1^P; rn = b_1^(-P); /* PARI */\n");
      for (i = 0; i < l_param; i++)
        outputf (OUTPUT_TRACE,
                 "/* pp1_sequence_h */ f_%lu = %Zd; /* PARI */\n", i, f[i]);
      mpz_clear (t);
    }

#ifdef _OPENMP
#pragma omp parallel if (l_param > 100) private(i)
#endif
  {
    const size_t tmplen = 2;
    mpres_t s_x[3], s_y[3], s2_x[2], s2_y[2], v[2], V2, rn_x, rn_y,
      tmp[2];
    mpmod_t modulus; /* Thread-local copy of modulus_param */
    mpz_t mt;
    unsigned long l = l_param, offset = 0;
    long k = k_param;

#ifdef _OPENMP
    /* When multi-threading, we adjust the parameters for each thread */

    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();

    l =
(l_param - 1) / nr_chunks + 1;
    offset = thread_nr * l;
    ASSERT_ALWAYS (l_param >= offset);
    l = MIN(l, l_param - offset);

    if (thread_nr == 0)
      outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    /* Not conditional on thread_nr: every thread executes this call */
    outputf (OUTPUT_TRACE, "\n");
#endif

    /* Each thread computes r[i + offset] = b1^(-P*(k+i+offset)^2) * f_i
       for i = 0, 1, ..., l-1, where l is the adjusted length of each thread */

    /* Test that k+offset does not overflow */
    ASSERT_ALWAYS (offset <= (unsigned long) LONG_MAX &&
                   k <= LONG_MAX - (long) offset);
    k += (long) offset;

    mpz_init (mt);
    /* Make thread-local copy of modulus */
    mpmod_init_set (modulus, modulus_param);

    /* Init the local mpres_t variables */
    for (i = 0; i < 2; i++)
      {
        mpres_init (s_x[i], modulus);
        mpres_init (s_y[i], modulus);
        mpres_init (s2_x[i], modulus);
        mpres_init (s2_y[i], modulus);
        mpres_init (v[i], modulus);
      }
    mpres_init (s_x[2], modulus);
    mpres_init (s_y[2], modulus);
    mpres_init (V2, modulus);
    mpres_init (rn_x, modulus);
    mpres_init (rn_y, modulus);
    for (i = 0; i < (unsigned long) tmplen; i++)
      mpres_init (tmp[i], modulus);

    /* Compute rn = b_1^{-P}. It has the same value for all threads,
       but we make thread local copies anyway. */
    gfp_ext_pow_norm1_sl (rn_x, rn_y, b1_x, b1_y, P, Delta, modulus, tmplen,
                          tmp);
    mpres_neg (rn_y, rn_y, modulus); /* Invert b_1^P by negating its second
                                        coordinate */

    /* Compute s[0] = rn^(k^2) = r^(-k^2). We do it by two
       exponentiations by k and use v[0] and v[1] as temp storage */
    gfp_ext_pow_norm1_sl (v[0], v[1], rn_x, rn_y, k, Delta, modulus,
                          tmplen, tmp);
    gfp_ext_pow_norm1_sl (s_x[0], s_y[0], v[0], v[1], k, Delta, modulus,
                          tmplen, tmp);
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
        {
          outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", k);
          gfp_ext_print (s_x[0], s_y[0], modulus, OUTPUT_TRACE);
          outputf (OUTPUT_TRACE, " /* PARI C */\n");
        }
      }

    /* Compute s[1] = r^(-(k+1)^2) = r^(-(k^2 + 2k + 1))*/
    if (l > 1)
      {
        /* v[0] + v[1]*sqrt(Delta) still contains rn^k */
        gfp_ext_sqr_norm1 (s_x[1], s_y[1], v[0], v[1], modulus);
        /* Now s[1] = r^(-2k) */
        gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], s_x[0], s_y[0], Delta,
                     modulus, tmplen, tmp);
        /* Now s[1] = r^(-(k^2 + 2k)) */
        gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], rn_x, rn_y, Delta,
                     modulus, tmplen, tmp);
        /* Now s[1] = r^(-(k^2 + 2k + 1)) = r^(-(k+1)^2) */
        if (test_verbose (OUTPUT_TRACE))
          {
#ifdef _OPENMP
#pragma omp critical
#endif
            {
              outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ",
                       k + 1);
              gfp_ext_print (s_x[1], s_y[1], modulus, OUTPUT_TRACE);
              outputf (OUTPUT_TRACE, " /* PARI C */\n");
            }
          }
      }

    /* Compute s2[0] = rn^(k^2+2) = rn^(k^2) * rn^2 */
    gfp_ext_sqr_norm1 (v[0], v[1], rn_x, rn_y, modulus); /* v[0,1] = rn^2 */
    gfp_ext_mul (s2_x[0], s2_y[0], s_x[0], s_y[0], v[0], v[1], Delta, modulus,
                 tmplen, tmp);
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
        {
          outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", k);
          gfp_ext_print (s2_x[0], s2_y[0], modulus, OUTPUT_TRACE);
          outputf (OUTPUT_TRACE, " /* PARI C */\n");
        }
      }

    /* Compute s2[1] = rn^((k+1)^2+2) = rn^((k+1)^2) * rn^2 */
    gfp_ext_mul (s2_x[1], s2_y[1], s_x[1], s_y[1], v[0], v[1], Delta, modulus,
                 tmplen, tmp);
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
        {
          outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ",
                   k + 1);
          gfp_ext_print (s2_x[1], s2_y[1], modulus, OUTPUT_TRACE);
          outputf (OUTPUT_TRACE, " /* PARI C */\n");
        }
      }

    /* Compute V_2(r + 1/r). Since 1/r = rn_x - rn_y, we have r+1/r = 2*rn_x.
       V_2(x) = x^2 - 2, so we want 4*rn_x^2 - 2. */
    mpres_add (V2, rn_x, rn_x, modulus); /* V2 = r + 1/r  = 2*rn_x */
    V (v[0], V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (r + 1/r) */
    V (v[1], V2, 2 * k + 3, modulus);  /* v[1] = V_{2k+3} (r + 1/r) */
    mpres_sqr (V2, V2, modulus);       /* V2 = 4*rn_x^2 */
    mpres_sub_ui (V2, V2, 2UL, modulus); /* V2 = 4*rn_x^2 - 2 */
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
        {
          mpres_get_z (mt, V2, modulus);
          outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^2 + 1/r^2 == %Zd "
                   "/* PARI C */\n", mt);
          mpres_get_z (mt, v[0], modulus);
          outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+1) + "
                   "1/r^(2*%ld+1) == %Zd /* PARI C */\n", k, k, mt);
          mpres_get_z (mt, v[1], modulus);
          outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+3) + "
                   "1/r^(2*%ld+3) == %Zd /* PARI C */\n", k, k, mt);
        }
      }

    for (i = 0; i < 2UL && i < l; i++)
      {
        /* Multiply the 2nd coordinate by Delta, so that after the polynomial
           multipoint evaluation we get x1 + Delta*x2 */
        mpres_mul (s_y[i], s_y[i], Delta, modulus);
        mpres_mul (s2_y[i], s2_y[i], Delta, modulus);

        if (h_x != NULL)
          mpres_mul_z_to_z (h_x[i + offset], s_x[i], f[i + offset], modulus);
        if (h_y != NULL)
          mpres_mul_z_to_z (h_y[i + offset], s_y[i], f[i + offset], modulus);
        if (h_x_ntt != NULL)
          {
            mpres_mul_z_to_z (mt, s_x[i], f[i + offset], modulus);
            mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
          }
        if (h_y_ntt != NULL)
          {
            mpres_mul_z_to_z (mt, s_y[i], f[i + offset], modulus);
            mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
          }
      }

    /* Compute the remaining r^((k+i)^2) values according to Peter's
       recurrence */

    for (i = 2; i < l; i++)
      {
        if (h_x != NULL || h_x_ntt != NULL)
          {
            /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
               taken modulo 2; s_x is indexed modulo 3 */
            mpres_mul (s_x[i % 3], s2_x[1 - i % 2], v[i % 2], modulus);
            mpres_sub (s_x[i % 3], s_x[i % 3],      s2_x[i % 2], modulus);

            /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
            mpres_mul (s2_x[i % 2], s2_x[1 - i % 2], v[1 - i % 2], modulus);
            mpres_sub (s2_x[i % 2], s2_x[i % 2],     s_x[(i - 2) % 3], modulus);
            if (h_x != NULL)
              mpres_mul_z_to_z (h_x[i + offset], s_x[i % 3], f[i + offset],
                                modulus);
            if (h_x_ntt != NULL)
              {
                mpres_mul_z_to_z (mt, s_x[i % 3], f[i + offset], modulus);
                mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
              }
          }

        if (h_y != NULL || h_y_ntt != NULL)
          {
            /* Same for y coordinate */
            mpres_mul (s_y[i % 3], s2_y[1 - i % 2], v[i % 2], modulus);
            mpres_sub (s_y[i % 3], s_y[i % 3],      s2_y[i % 2], modulus);
            mpres_mul (s2_y[i % 2], s2_y[1 - i % 2], v[1 - i % 2], modulus);
            mpres_sub (s2_y[i % 2], s2_y[i % 2],     s_y[(i - 2) % 3], modulus);
            if (h_y != NULL)
              mpres_mul_z_to_z (h_y[i + offset], s_y[i % 3], f[i + offset],
                                modulus);
            if (h_y_ntt != NULL)
              {
                mpres_mul_z_to_z (mt, s_y[i % 3], f[i + offset], modulus);
                mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
              }
          }

        /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
        mpres_mul (tmp[0], v[1 - i % 2], V2, modulus);
        mpres_sub (v[i % 2], tmp[0], v[i % 2], modulus);
      }

    /* Clear the local mpres_t variables */
    for (i = 0; i < 2; i++)
      {
        mpres_clear (s_x[i], modulus);
        mpres_clear (s_y[i], modulus);
        mpres_clear (s2_x[i], modulus);
        mpres_clear (s2_y[i], modulus);
        mpres_clear (v[i], modulus);
      }
    mpres_clear (s_x[2], modulus);
    mpres_clear (s_y[2], modulus);
    mpres_clear (V2, modulus);
    mpres_clear (rn_x, modulus);
    mpres_clear (rn_y, modulus);
    for (i = 0; i < tmplen; i++)
      mpres_clear (tmp[i], modulus);

    /* Clear the thread-local copy of modulus */
    mpmod_clear (modulus);

    mpz_clear (mt);
  }

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  /* The full sequence is only traced from the mpz_t lists. Unlike the other
     trace output in this function, this goes through gmp_printf(). */
  if (h_x != NULL && h_y != NULL && test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < l_param; i++)
        gmp_printf ("/* pp1_sequence_h */ (rn^((k+%lu)^2) * f_%lu) == "
                    "(%Zd + Mod(%Zd / Delta, N) * w) /* PARI C */\n",
                    i, i, h_x[i], h_y[i]);
    }
}


int
pp1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus, const
faststage2_param_t *params) { unsigned long nr; unsigned long i, l, lenF, lenH, lenG, lenR, tmplen; sets_long_t *S_1; /* This is stored as a set of sets (arithmetic progressions of prime length */ set_long_t *S_2; /* This is stored as a regular set */ listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has degree s_1. It is symmetric, so has only s_1 / 2 + 1 distinct coefficients. The sequence h_j will be stored in the same memory and won't be a monic polynomial, so the leading 1 monomial of F will be stored explicitly. Hence we need s_1 / 2 + 1 entries. */ listz_t g_x, g_y, fh_x, fh_y, h_x, h_y, tmp, R_x, R_y; const unsigned long tmpreslen = 2UL; mpres_t b1_x, b1_y, Delta, tmpres[2]; mpz_t mt; /* All-purpose temp mpz_t */ int youpi = ECM_NO_FACTOR_FOUND; long timetotalstart, realtotalstart, timestart; timetotalstart = cputime (); realtotalstart = realtime (); ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2); ASSERT_ALWAYS (params->s_1 < params->l); nr = params->l - params->s_1; /* Number of points we evaluate */ if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR) return ECM_ERROR; /* Allocate all the memory we'll need */ /* Allocate the correct amount of space for each mpz_t or the reallocations will up to double the time for stage 2! 
*/ mpz_init (mt); mpres_init (b1_x, modulus); mpres_init (b1_y, modulus); mpres_init (Delta, modulus); for (i = 0; i < tmpreslen; i++) mpres_init (tmpres[i], modulus); lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores the leading 1 monomial for each factor */ lenH = params->s_1 + 1; lenG = params->l; lenR = nr; F = init_list2 (lenF, (unsigned int) abs (modulus->bits)); fh_x = init_list2 (lenF, (unsigned int) abs (modulus->bits)); fh_y = init_list2 (lenF, (unsigned int) abs (modulus->bits)); h_x = malloc (lenH * sizeof (mpz_t)); h_y = malloc (lenH * sizeof (mpz_t)); if (h_x == NULL || h_y == NULL) { fprintf (stderr, "Cannot allocate memory in pp1fs2\n"); exit (1); } g_x = init_list2 (lenG, (unsigned int) abs (modulus->bits)); g_y = init_list2 (lenG, (unsigned int) abs (modulus->bits)); R_x = init_list2 (lenR, (unsigned int) abs (modulus->bits)); R_y = init_list2 (lenR, (unsigned int) abs (modulus->bits)); tmplen = 3UL * params->l + list_mul_mem (params->l / 2) + 20; outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen); if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen) { tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12; /* FIXME: It appears TMulGen_space() returns a too small value! 
*/ outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n", tmplen); } tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits)); if (test_verbose (OUTPUT_TRACE)) { mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */ outputf (OUTPUT_TRACE, "N = %Zd; X = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, mt); } /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2 k_1}) */ outputf (OUTPUT_VERBOSE, "Computing F from factored S_1"); timestart = cputime (); i = poly_from_sets_V (F, X, S_1, tmp, tmplen, modulus, NULL, NULL); ASSERT_ALWAYS(2 * i == params->s_1); ASSERT(mpz_cmp_ui (F[i], 1UL) == 0); free (S_1); S_1 = NULL; outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart); if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < params->s_1 / 2 + 1; i++) outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]); outputf (OUTPUT_TRACE, "f(x) = f_0"); for (i = 1; i < params->s_1 / 2 + 1; i++) outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i); outputf (OUTPUT_TRACE, "/* PARI */ \n"); } /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */ mpres_sqr (Delta, X, modulus); mpres_sub_ui (Delta, Delta, 4UL, modulus); mpres_div_2exp (b1_x, X, 1, modulus); mpres_set_ui (b1_y, 1UL, modulus); mpres_div_2exp (b1_y, b1_y, 1, modulus); if (test_verbose (OUTPUT_TRACE)) { mpres_get_z (mt, Delta, modulus); outputf (OUTPUT_TRACE, "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt); gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE); outputf (OUTPUT_TRACE, "; /* PARI */\n"); outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n"); } /* Compute the h sequence h_j = b1^(P*-j^2) * f_j for 0 <= j <= s_1 */ pp1_sequence_h (fh_x, fh_y, NULL, NULL, F, b1_x, b1_y, 0L, params->s_1 / 2 + 1, params->P, Delta, modulus, NULL); /* We don't need F(x) any more */ clear_list (F, lenF); /* Make a symmetric copy of fh in h. 
*/ for (i = 0; i < params->s_1 / 2 + 1; i++) { *(h_x[i]) = *(fh_x[params->s_1 / 2 - i]); /* Clone the mpz_t */ *(h_y[i]) = *(fh_y[params->s_1 / 2 - i]); } for (i = 0; i < params->s_1 / 2; i++) { *(h_x[i + params->s_1 / 2 + 1]) = *(fh_x[i + 1]); *(h_y[i + params->s_1 / 2 + 1]) = *(fh_y[i + 1]); } if (test_verbose (OUTPUT_TRACE)) { for (i = 0; i < params->s_1 + 1; i++) outputf (OUTPUT_VERBOSE, "h_%lu = %Zd + %Zd * w; /* PARI */\n", i, h_x[i], h_y[i]); } for (l = 0; l < params->s_2; l++) { const long M = params->l - 1 - params->s_1 / 2; outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n", l + 1, params->s_2); pp1_sequence_g (g_x, g_y, NULL, NULL, b1_x, b1_y, params->P, Delta, M, params->l, params->m_1, S_2->elem[l], modulus, NULL); /* Do the two convolution products */ outputf (OUTPUT_VERBOSE, "TMulGen of g_x and h_x"); timestart = cputime (); if (TMulGen (R_x, nr - 1, h_x, params->s_1, g_x, params->l - 1, tmp, modulus->orig_modulus) < 0) { outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out " "of memory)\n"); youpi = ECM_ERROR; break; } outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart); outputf (OUTPUT_VERBOSE, "TMulGen of g_y and h_y"); timestart = cputime (); if (TMulGen (R_y, nr - 1, h_y, params->s_1, g_y, params->l - 1, tmp, modulus->orig_modulus) < 0) { outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out " "of memory)\n"); youpi = ECM_ERROR; break; } outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart); for (i = 0; i < nr; i++) mpz_add (R_x[i], R_x[i], R_y[i]); timestart = cputime (); mpres_set_ui (tmpres[1], 1UL, modulus); /* Accumulate product in tmpres[1] */ for (i = 0; i < nr; i++) { mpres_set_z_for_gcd (tmpres[0], R_x[i], modulus); #define TEST_ZERO_RESULT #ifdef TEST_ZERO_RESULT if (mpres_is_zero (tmpres[0], modulus)) outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i); #endif mpres_mul (tmpres[1], tmpres[1], tmpres[0], modulus); } outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)^(1) took 
%lums\n", cputime () - timestart); if (test_verbose(OUTPUT_RESVERBOSE)) { mpres_get_z (mt, tmpres[1], modulus); outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd (times some " "power of 2 if REDC was used! Try -mpzmod)\n", mt); } mpres_gcd (mt, tmpres[1], modulus); if (mpz_cmp_ui (mt, 1UL) > 0) { mpz_set (f, mt); youpi = ECM_FACTOR_FOUND_STEP2; break; } } mpz_clear (mt); mpres_clear (b1_x, modulus); mpres_clear (b1_y, modulus); mpres_clear (Delta, modulus); for (i = 0; i < tmpreslen; i++) mpres_clear (tmpres[i], modulus); clear_list (fh_x, lenF); clear_list (fh_y, lenF); free (h_x); free (h_y); clear_list (g_x, lenG); clear_list (g_y, lenG); clear_list (R_x, lenR); clear_list (R_y, lenR); clear_list (tmp, tmplen); free (S_2); outputf (OUTPUT_NORMAL, "Step 2"); /* In normal output mode, print only cpu time as we always have. In verbose mode, print real time as well if we used multi-threading */ if (test_verbose (OUTPUT_VERBOSE)) print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart); else print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L); return youpi; } int pp1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus, const faststage2_param_t *params, const int twopass) { unsigned long nr; unsigned long l, lenF; sets_long_t *S_1; /* This is stored as a set of sets (arithmetic progressions of prime length */ set_long_t *S_2; /* This is stored as a regular set */ listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has degree s_1. It is symmetric, so has only s_1 / 2 + 1 distinct coefficients. The sequence h_j will be stored in the same memory and won't be a monic polynomial, so the leading 1 monomial of F will be stored explicitly. Hence we need s_1 / 2 + 1 entries. */ listz_t R = NULL; /* Is used only for two-pass convolution, has nr entries. R is only ever referenced if twopass == 1, but gcc does not realize that and complains about uninitialized value, so we set it to NULL. 
*/ mpzspm_t ntt_context; mpzspv_t g_x_ntt, g_y_ntt, h_x_ntt, h_y_ntt; mpres_t b1_x, b1_y, Delta; mpz_t mt; /* All-purpose temp mpz_t */ mpz_t product; mpz_t *product_ptr = NULL; int youpi = ECM_NO_FACTOR_FOUND; long timetotalstart, realtotalstart, timestart, realstart; timetotalstart = cputime (); realtotalstart = realtime (); ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2); ASSERT_ALWAYS (params->s_1 < params->l); nr = params->l - params->s_1; /* Number of points we evaluate */ if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR) return ECM_ERROR; mpz_init (mt); /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution with g. We need NTT of transform length l here. If we want to add transformed vectors, we need to double the modulus. */ if (twopass) mpz_set (mt, modulus->orig_modulus); else mpz_mul_2exp (mt, modulus->orig_modulus, 1UL); ntt_context = mpzspm_init (params->l, mt); if (ntt_context == NULL) { outputf (OUTPUT_ERROR, "Could not initialise ntt_context, " "presumably out of memory\n"); mpz_clear (mt); free (S_1); S_1 = NULL; free (S_2); S_2 = NULL; return ECM_ERROR; } print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ", ntt_context); /* Allocate memory for F with correct amount of space for each mpz_t */ lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores the leading 1 monomial for each factor */ MEMORY_TAG; F = init_list2 (lenF, (unsigned int) abs (modulus->bits) + GMP_NUMB_BITS); MEMORY_UNTAG; /* Build F */ if (build_F_ntt (F, X, S_1, params, modulus) == ECM_ERROR) { free (S_1); free (S_2); mpz_clear (mt); mpzspm_clear (ntt_context); clear_list (F, lenF); return ECM_ERROR; } free (S_1); S_1 = NULL; mpres_init (b1_x, modulus); mpres_init (b1_y, modulus); mpres_init (Delta, modulus); /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */ mpres_sqr (Delta, X, modulus); mpres_sub_ui (Delta, Delta, 4UL, modulus); mpres_div_2exp (b1_x, X, 1, modulus); mpres_set_ui (b1_y, 1UL, 
modulus); mpres_div_2exp (b1_y, b1_y, 1, modulus); if (test_verbose (OUTPUT_TRACE)) { mpres_get_z (mt, Delta, modulus); outputf (OUTPUT_TRACE, "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt); gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE); outputf (OUTPUT_TRACE, "; /* PARI */\n"); outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n"); } /* Allocate remaining memory for h_ntt */ h_x_ntt = mpzspv_init (params->l / 2 + 1, ntt_context); h_y_ntt = mpzspv_init (params->l / 2 + 1, ntt_context); /* Compute the h_j sequence */ pp1_sequence_h (NULL, NULL, h_x_ntt, h_y_ntt, F, b1_x, b1_y, 0L, params->s_1 / 2 + 1, params->P, Delta, modulus, ntt_context); /* We don't need F(x) any more */ clear_list (F, lenF); /* compute the forward transform of h and store the distinct coefficients in h_ntt */ g_x_ntt = mpzspv_init (params->l, ntt_context); if (twopass) { g_y_ntt = g_x_ntt; MEMORY_TAG; R = init_list2 (nr, (mpz_size (modulus->orig_modulus) + 2) * GMP_NUMB_BITS); MEMORY_UNTAG; } else g_y_ntt = mpzspv_init (params->l, ntt_context); /* Compute DCT-I of h_x and h_y */ outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_x"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_to_dct1 (h_x_ntt, h_x_ntt, params->s_1 / 2 + 1, params->l / 2 + 1, g_x_ntt, ntt_context); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_y"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_to_dct1 (h_y_ntt, h_y_ntt, params->s_1 / 2 + 1, params->l / 2 + 1, g_x_ntt, ntt_context); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); if (test_verbose (OUTPUT_RESVERBOSE)) { mpz_init (product); product_ptr = &product; } for (l = 0; l < params->s_2; l++) { const long M = params->l - 1 - params->s_1 / 2; outputf (OUTPUT_VERBOSE, "Multi-point 
evaluation %lu of %lu:\n", l + 1, params->s_2); if (twopass) { /* Two-pass variant. Two separate convolutions, then addition in Z/NZ */ pp1_sequence_g (NULL, NULL, g_x_ntt, NULL, b1_x, b1_y, params->P, Delta, M, params->l, params->m_1, S_2->elem[l], modulus, ntt_context); /* Do the convolution product of g_x * h_x */ outputf (OUTPUT_VERBOSE, "Computing g_x*h_x"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); /* Store the product coefficients we want in R */ mpzspv_to_mpzv (g_x_ntt, params->s_1 / 2, R, nr, ntt_context); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); /* Compute g_y sequence */ pp1_sequence_g (NULL, NULL, NULL, g_y_ntt, b1_x, b1_y, params->P, Delta, M, params->l, params->m_1, S_2->elem[l], modulus, ntt_context); /* Do the convolution product of g_y * (Delta * h_y) */ outputf (OUTPUT_VERBOSE, "Computing g_y*h_y"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); /* Compute product of sum of coefficients and gcd with N */ ntt_gcd (mt, product_ptr, g_y_ntt, params->s_1 / 2, R, nr, ntt_context, modulus); } else { /* One-pass variant. 
Two forward transforms and point-wise products, then addition and single inverse transform */ pp1_sequence_g (NULL, NULL, g_x_ntt, g_y_ntt, b1_x, b1_y, params->P, Delta, M, params->l, params->m_1, S_2->elem[l], modulus, ntt_context); outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_x"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_y"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); outputf (OUTPUT_VERBOSE, "Adding and computing inverse NTT of sum"); #ifdef _OPENMP outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit()); #endif timestart = cputime (); realstart = realtime (); mpzspv_add (g_x_ntt, (spv_size_t) 0, g_x_ntt, (spv_size_t) 0, g_y_ntt, (spv_size_t) 0, params->l, ntt_context); mpzspv_mul_by_dct (g_x_ntt, NULL, params->l, ntt_context, NTT_MUL_STEP_IFFT); print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart); ntt_gcd (mt, product_ptr, g_x_ntt, params->s_1 / 2, NULL, nr, ntt_context, modulus); } outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd (times some " "power of 2 if REDC was used! 
Try -mpzmod)\n", product); if (mpz_cmp_ui (mt, 1UL) > 0) { mpz_set (f, mt); youpi = ECM_FACTOR_FOUND_STEP2; break; } } if (test_verbose (OUTPUT_RESVERBOSE)) { product_ptr = NULL; mpz_clear (product); } mpzspv_clear (g_x_ntt, ntt_context); if (twopass) clear_list (R, nr); else mpzspv_clear (g_y_ntt, ntt_context); mpzspv_clear (h_x_ntt, ntt_context); mpzspv_clear (h_y_ntt, ntt_context); mpzspm_clear (ntt_context); mpz_clear (mt); mpres_clear (b1_x, modulus); mpres_clear (b1_y, modulus); mpres_clear (Delta, modulus); free (S_2); outputf (OUTPUT_NORMAL, "Step 2"); /* In normal output mode, print only cpu time as we always have. In verbose mode, print real time as well if we used multi-threading */ if (test_verbose (OUTPUT_VERBOSE)) print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart); else print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L); return youpi; } ecm-6.4.4/mul_fft-params.h.athlon640000644023561000001540000001121212106741273013737 00000000000000#define MUL_FFT_MODF_THRESHOLD 300 #define SQR_FFT_MODF_THRESHOLD 568 #define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {401, 5 /*96*/}, {417, 4 /*98*/}, {433, 5 /*96*/}, {865, 6 /*96*/}, {897, 5 /*98*/}, {929, 6 /*96*/}, {2113, 7 /*97*/}, {2177, 6 /*98*/}, {2241, 7 /*97*/}, {2305, 6 /*98*/}, {2369, 7 /*97*/}, {3713, 8 /*93*/}, {3841, 7 /*98*/}, {4225, 8 /*94*/}, {4353, 7 /*98*/}, {4481, 8 /*94*/}, {4865, 7 /*98*/}, {4993, 8 /*95*/}, {6913, 9 /*87*/}, {7169, 8 /*96*/}, {7425, 9 /*93*/}, {7681, 8 /*96*/}, {8449, 9 /*94*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*90*/}, {11777, 8 /*97*/}, {12033, 9 /*92*/}, {13825, 10 /*87*/}, {14337, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {20993, 10 /*87*/}, {21505, 9 /*97*/}, {22017, 10 /*91*/}, {23553, 9 /*97*/}, {26113, 10 /*92*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {44033, 11 /*91*/}, {47105, 10 /*97*/}, {56321, 11 /*87*/}, {63489, 10 /*98*/}, {70657, 11 /*87*/}, {71681, 10 /*98*/}, {72705, 11 /*90*/}, {79873, 10 /*98*/}, {80897, 11 
/*83*/}, {81921, 10 /*96*/}, {82945, 11 /*85*/}, {96257, 10 /*98*/}, {97281, 12 /*75*/}, {98305, 10 /*97*/}, {101377, 12 /*78*/}, {102401, 11 /*91*/}, {110593, 12 /*87*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 11 /*98*/}, {194561, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 11 /*99*/}, {276481, 12 /*85*/}, {282625, 11 /*96*/}, {284673, 12 /*87*/}, {389121, 11 /*99*/}, {391169, 13 /*75*/}, {434177, 12 /*95*/}, {438273, 13 /*84*/}, {516097, 12 /*99*/}, {585729, 11 /*99*/}, {620545, 13 /*79*/}, {630785, 12 /*96*/}, {651265, 13 /*83*/}, {778241, 12 /*99*/}, {782337, 11 /*99*/}, {817153, 12 /*96*/}, {819201, 14 /*79*/}, {1032193, 13 /*99*/}, {1040385, 11 /*99*/}, {1046529, 12 /*94*/}, {LONG_MAX, 0}} #define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {385, 4 /*98*/}, {401, 5 /*96*/}, {801, 6 /*96*/}, {833, 5 /*98*/}, {865, 6 /*96*/}, {1729, 7 /*96*/}, {1793, 6 /*98*/}, {1857, 7 /*96*/}, {2049, 6 /*98*/}, {2113, 7 /*97*/}, {3841, 8 /*96*/}, {4097, 7 /*98*/}, {4225, 8 /*97*/}, {4609, 7 /*98*/}, {4737, 8 /*97*/}, {7169, 9 /*93*/}, {7681, 8 /*98*/}, {8449, 9 /*94*/}, {8705, 8 /*98*/}, {8961, 9 /*94*/}, {9217, 8 /*98*/}, {9473, 9 /*95*/}, {14849, 10 /*93*/}, {15361, 9 /*96*/}, {17921, 10 /*90*/}, {19457, 9 /*97*/}, {20481, 10 /*95*/}, {21505, 9 /*97*/}, {22017, 10 /*91*/}, {23553, 9 /*97*/}, {24065, 10 /*92*/}, {29697, 11 /*93*/}, {30721, 10 /*96*/}, {37889, 11 /*95*/}, {38913, 10 /*97*/}, {44033, 11 /*91*/}, {47105, 10 /*97*/}, {52225, 11 /*92*/}, {55297, 10 /*98*/}, {56321, 11 /*87*/}, {63489, 10 /*98*/}, {64513, 11 /*88*/}, {79873, 12 /*83*/}, {81921, 11 /*93*/}, {88065, 12 /*91*/}, {94209, 11 /*97*/}, {104449, 12 /*81*/}, {110593, 11 /*98*/}, {112641, 12 /*87*/}, {126977, 11 /*98*/}, {137217, 12 /*85*/}, {159745, 11 /*98*/}, {161793, 12 /*83*/}, {167937, 11 /*98*/}, {169985, 12 /*87*/}, {192513, 11 /*98*/}, {194561, 12 /*85*/}, {196609, 11 /*97*/}, {202753, 12 /*89*/}, {217089, 13 /*84*/}, {221185, 12 /*98*/}, {225281, 13 
/*87*/}, {253953, 12 /*98*/}, {323585, 13 /*83*/}, {385025, 12 /*98*/}, {389121, 14 /*75*/}, {393217, 12 /*93*/}, {405505, 14 /*78*/}, {507905, 13 /*98*/}, {516097, 12 /*99*/}, {552961, 13 /*85*/}, {573441, 12 /*97*/}, {577537, 13 /*88*/}, {778241, 12 /*99*/}, {782337, 13 /*85*/}, {851969, 14 /*82*/}, {868353, 13 /*95*/}, {909313, 14 /*87*/}, {1032193, 13 /*99*/}, {LONG_MAX, 0}} #define MUL_FFT_FULL_TABLE2 {{16, 1}, {4224, 2}, {4416, 6}, {4480, 2}, {4608, 4}, {4640, 2}, {4800, 1}, {5120, 2}, {5184, 1}, {5632, 2}, {5760, 1}, {6656, 4}, {6720, 1}, {7168, 4}, {7360, 1}, {7936, 4}, {8000, 2}, {8064, 1}, {8704, 2}, {8832, 6}, {8960, 3}, {9216, 1}, {13312, 6}, {14336, 3}, {15360, 5}, {16896, 6}, {17920, 1}, {19968, 2}, {20736, 1}, {21504, 2}, {23808, 1}, {28672, 4}, {29440, 2}, {29952, 1}, {33792, 2}, {35328, 1}, {36864, 4}, {37120, 1}, {49152, 4}, {49920, 1}, {50176, 3}, {53248, 1}, {55296, 2}, {59904, 3}, {61440, 1}, {65536, 2}, {70656, 6}, {71680, 2}, {72192, 5}, {73728, 4}, {79360, 1}, {81920, 2}, {82944, 1}, {86016, 2}, {89088, 1}, {90112, 2}, {95232, 1}, {100352, 5}, {110592, 1}, {114688, 4}, {117760, 1}, {131072, 2}, {144384, 5}, {147456, 4}, {158720, 1}, {161792, 3}, {163840, 2}, {190464, 1}, {196608, 4}, {199680, 3}, {212992, 1}, {262144, 6}, {272384, 7}, {294912, 6}, {301056, 4}, {322560, 1}, {327680, 3}, {344064, 2}, {380928, 1}, {385024, 2}, {387072, 1}, {393216, 7}, {425984, 6}, {444416, 5}, {466944, 1}, {520192, 2}, {577536, 7}, {589824, 6}, {602112, 4}, {645120, 3}, {688128, 2}, {774144, 1}, {786432, 6}, {788480, 4}, {808960, 5}, {811008, 2}, {817152, 3}, {819200, 5}, {823296, 2}, {829440, 1}, {1048576, 2}, {1069056, 1}, {1073152, 5}, {1081344, 3}, {1089536, 2}, {LONG_MAX, 1}} ecm-6.4.4/ecm-params.h.pentium40000644023561000001540000000112112106741273013153 00000000000000/* those parameters were generated on 3 Jan 2012 on macaron.loria.fr (Intel(R) Pentium(R) 4 CPU 3.20GHz) for ecm-6.4 with GMP 5.0.2 */ #define MPZMOD_THRESHOLD 84 #define REDC_THRESHOLD 119 
#define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 4096 #define PREREVERTDIVISION_NTT_THRESHOLD 64 #define POLYINVERT_NTT_THRESHOLD 1024 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 2048 ecm-6.4.4/ecm-params.h.ia640000644023561000001540000000160112106741273012154 00000000000000/* those parameters were obtained on gcc60.fsffrance.org with ecm-6.3-rc3 gmp-5.0.1, and gcc 4.3.2 -O2 -pedantic -mtune=itanium2 (ia64-unknown-linux-gnu) */ /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} #define MPZMOD_THRESHOLD 61 #define REDC_THRESHOLD 512 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 12, 1, 14, 14, 16, 1, 18, 19, 16, 20, 18, 19, 18, 19, 20, 21} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 17 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 17 #define MUL_NTT_THRESHOLD 262144 #define PREREVERTDIVISION_NTT_THRESHOLD 262144 #define POLYINVERT_NTT_THRESHOLD 262144 #define POLYEVALT_NTT_THRESHOLD 262144 #define MPZSPV_NORMALISE_STRIDE 2048 ecm-6.4.4/pentium4/0000755023561000001540000000000012113421640011032 500000000000000ecm-6.4.4/pentium4/autogen.py0000755023561000001540000001640412106741272013006 00000000000000#!/usr/bin/python import re import sys def offaddr(addr, offset): if offset == 0: return "("+addr+")" else: return str(offset)+"("+addr+")" # Generate asm for addmul1_k # src and dst are pointers (stored in regs) + offsets # multiplier is in a register # rax, rbx, rcx, rdx are free for use. 
def addmul1_k(src, off_src, dst, off_dst, mult, k): init = "### addmul1: src[0] is " + offaddr(src, off_src) + "\n" init = init + "### dst[0] is " + offaddr(dst, off_dst) + "\n" init = init + "### mult is " + mult + "\n" init = init + "### k is " + str(k) + "\n" init = init + "### kills %eax, %ebx, %ecx, %edx\n" init = init + "### dst[0,k[ += mult*src[0,k[ plus carry put in ecx or ebx\n" init = init + " movl " + offaddr(src, off_src) + ", %eax\n" init = init + " mull " + mult + "\n" init = init + " movl %eax, %ebx\n" init = init + " movl %edx, %ecx\n" block = """ movl __xii__, %eax mull __mult__ addl __cylo__, __zi__ adcl %eax, __cyhi__ movl %edx, __cylo__ adcl $0, __cylo__ """ code = init cylo = "%ebx" cyhi = "%ecx" for i in range(0,k-1): blocki = re.sub('__cylo__', cylo, block) blocki = re.sub('__cyhi__', cyhi, blocki) blocki = re.sub('__xii__', offaddr(src, off_src+(i+1)*4), blocki) blocki = re.sub('__zi__', offaddr(dst, off_dst+i*4), blocki) blocki = re.sub('__mult__', mult, blocki) code = code + blocki tmp = cylo cylo = cyhi cyhi = tmp final = " addl " + cylo + ", " + offaddr(dst, off_dst+4*(k-1)) + "\n" final = final + " adcl $0, " + cyhi + "\n" final = final + "### carry limb is in " + cyhi + "\n" code = code + final return code, cyhi ### Try mmx/sse2 addmul_1, copying the one of GMP for Pentium4 def addmul1_k_var(src, off_src, dst, off_dst, mult, k): init = "### addmul1: src[0] is " + offaddr(src, off_src) + "\n" init = init + "### dst[0] is " + offaddr(dst, off_dst) + "\n" init = init + "### mult is " + mult + "\n" init = init + "### k is " + str(k) + "\n" init = init + "### kills %eax, %edx and mmx regs \n" init = init + "### dst[0,k[ += mult*src[0,k[ plus carry put in ecx\n" init = init + " pxor %mm0, %mm0\n" init = init + " movd " + mult + ", %mm7\n" block = """ movd __xi__, %mm1 movd __zi__, %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, __zi__ psrlq $32, %mm0 """ code = init for i in range(0,k): blocki = re.sub('__xi__', 
offaddr(src, off_src+i*4), block) blocki = re.sub('__zi__', offaddr(dst, off_dst+i*4), blocki) code = code + blocki final = " movd %mm0, %ecx\n" final = final + "### carry limb is in %ecx\n" code = code + final return code, "%ecx" def mulredc_k_rolled(k): header = """# mp_limb_t mulredc__k(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc__k TYPE(GSYM_PREFIX`'mulredc__k,`function') GSYM_PREFIX`'mulredc__k: """ init = re.sub("__k", str(k), header) INV_M = offaddr("%esp", 4*(2*k+1) + 40) ADDR_M = offaddr("%esp", 4*(2*k+1) + 36) ADDR_Y = offaddr("%esp", 4*(2*k+1) + 32) ADDR_X = offaddr("%esp", 4*(2*k+1) + 28) ADDR_Z = offaddr("%esp", 4*(2*k+1) + 24) init = init + """ pushl %ebp pushl %edi pushl %esi pushl %ebx """ init = init + " subl $" + str(4*(2*k+2)) + ", %esp\n" init = init + " movl %esp, %edi\n" init = init + "### set tmp[0..2k+1[ to 0\n" for i in range(0,2*k+1): init = init + " movl $0, " + offaddr("%edi", 4*i) + "\n" code = init middle_code = "###########################################\n" middle_code = middle_code + " movl $" + str(k) + ", " + offaddr("%esp", 4*(2*k+1)) + "\n" middle_code = middle_code + """ .align 32 Loop: ## compute u and store in %ebp """ middle_code = middle_code + " movl " + ADDR_X + ", %eax\n" middle_code = middle_code + " movl " + ADDR_Y + ", %esi\n" middle_code = middle_code + """ movl (%eax), %eax mull (%esi) addl (%edi), %eax """ middle_code = middle_code + " mull " + INV_M + "\n" middle_code = middle_code + " movl %eax, %ebp\n" middle_code = middle_code + " movl " + ADDR_M + ", %esi\n" codeaddmul, carry = addmul1_k_var("%esi", 0, "%edi", 0, "%ebp", k) middle_code = middle_code + codeaddmul middle_code = 
middle_code + " addl " + carry + ", " + offaddr("%edi", 4*k) + "\n" middle_code = middle_code + " adcl $0, " + offaddr("%edi", 4*(k+1)) + "\n" middle_code = middle_code + " movl " + ADDR_X + ", %eax\n" middle_code = middle_code + " movl (%eax), %ebp\n" middle_code = middle_code + " movl " + ADDR_Y + ", %esi\n" codeaddmul, carry = addmul1_k_var("%esi", 0, "%edi", 0, "%ebp", k) middle_code = middle_code + codeaddmul middle_code = middle_code + " addl " + carry + ", " + offaddr("%edi", 4*k) + "\n" middle_code = middle_code + " adcl $0, " + offaddr("%edi", 4*(k+1)) + "\n\n" middle_code = middle_code + " addl $4, " + ADDR_X + "\n addl $4, %edi\n" middle_code = middle_code + " decl " + offaddr("%esp", 4*(2*k+1)) + "\n jnz Loop\n" code = code + middle_code final = "###########################################\n" final = final + "### Copy result in z\n" final = final + " movl " + ADDR_Z + ", %ebx\n" for i in range(0,k): final = final + " movl " + offaddr("%edi", 4*i) + ", %eax\n" final = final + " movl %eax, " + offaddr("%ebx", 4*i) + "\n" final = final + " movl " + offaddr("%edi", 4*k) + ", %eax # carry\n" final = final + " addl $" + str(4*(2*k+2)) + ", %esp\n" final = final + " popl %ebx\n" final = final + " popl %esi\n" final = final + " popl %edi\n" final = final + " popl %ebp\n" final = final + " emms\n" final = final + " ret\n" code = code + final return code k = int(sys.argv[1]) if k == 1: print """# # mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y, # const mp_limb_t m, mp_limb_t inv_m) # # Compute z := x*y mod m, in Montgomery representation, where x, y < m # and m is n limb wide. inv_m is the less significant limb of the # inverse of m modulo 2^(n*GMP_LIMB_BITS) # # The result might be unreduced (larger than m) but becomes reduced # after subtracting m. The calling function should take care of that. # # We use a temporary space for unreduced product on the stack. 
# Therefore, this can not be used for large integers (anyway, the # algorithm is quadratic). # # WARNING: z is only n limbs but since it might be unreduced, there # could be a carry that does not fit in z. This carry is returned. include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc1 TYPE(GSYM_PREFIX`'mulredc1,`function') GSYM_PREFIX`'mulredc1: # Stack: # inv_m 20(%esp) # m 16 # y 12(%esp) # x 8 # z 4(%esp) movl 12(%esp), %eax mull 8(%esp) movl %edx, 12(%esp) movl %eax, 8(%esp) # store xy in [8(%esp):12(%esp)] mull 20(%esp) # compute u mull 16(%esp) # compute u*m addl 8(%esp), %eax # eax is 0, now (carry is important) adcl 12(%esp), %edx movl 4(%esp), %ecx movl %edx, (%ecx) adcl $0, %eax ret """ else: print mulredc_k_rolled(k) ecm-6.4.4/pentium4/mulredc3.asm0000644023561000001540000000560612106741272013211 00000000000000# mp_limb_t mulredc3(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc3 TYPE(GSYM_PREFIX`'mulredc3,`function') GSYM_PREFIX`'mulredc3: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $32, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) ########################################### movl $3, 28(%esp) .align 32 Loop: ## compute u and store in %ebp movl 56(%esp), %eax movl 60(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 68(%esp) movl %eax, %ebp movl 64(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 3 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 12(%edi) adcl $0, 16(%edi) movl 56(%esp), %eax movl (%eax), %ebp movl 60(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 3 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 12(%edi) adcl $0, 16(%edi) addl $4, 56(%esp) 
addl $4, %edi decl 28(%esp) jnz Loop ########################################### ### Copy result in z movl 52(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax # carry addl $32, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc.h0000644023561000001540000000462512106741272012575 00000000000000#ifndef __ASM_REDC_H__ #define __ASM_REDC_H__ #include /* Signals that we have assembly code for variable size redc */ #define HAVE_ASM_REDC3 extern void ecm_redc3(mp_limb_t *, const mp_limb_t *, mp_size_t, mp_limb_t); /* WARNING: the size-1 version doesn't take pointers in input */ extern mp_limb_t mulredc1(mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); extern mp_limb_t mulredc2(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc3(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc4(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc5(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc6(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc7(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc8(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc9(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc10(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc11(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc12(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t 
mulredc13(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc14(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc15(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc16(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc17(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc18(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc19(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc20(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); #endif ecm-6.4.4/pentium4/mulredc14.asm0000644023561000001540000001532312106741272013270 00000000000000# mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc14 TYPE(GSYM_PREFIX`'mulredc14,`function') GSYM_PREFIX`'mulredc14: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $120, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) ########################################### movl $14, 116(%esp) .align 32 Loop: ## compute u and store in %ebp movl 144(%esp), %eax movl 148(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 156(%esp) movl %eax, %ebp movl 152(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 14 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 
24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 56(%edi) adcl $0, 60(%edi) movl 144(%esp), %eax movl (%eax), %ebp movl 148(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 14 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 
movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 56(%edi) adcl $0, 60(%edi) addl $4, 144(%esp) addl $4, %edi decl 116(%esp) jnz Loop ########################################### ### Copy result in z movl 140(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax # carry addl $120, %esp popl %ebx popl %esi 
popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc5.asm0000644023561000001540000000711212106741272013205 00000000000000# mp_limb_t mulredc5(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc5 TYPE(GSYM_PREFIX`'mulredc5,`function') GSYM_PREFIX`'mulredc5: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $48, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) ########################################### movl $5, 44(%esp) .align 32 Loop: ## compute u and store in %ebp movl 72(%esp), %eax movl 76(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 84(%esp) movl %eax, %ebp movl 80(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 5 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx 
addl %ecx, 20(%edi) adcl $0, 24(%edi) movl 72(%esp), %eax movl (%eax), %ebp movl 76(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 5 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 20(%edi) adcl $0, 24(%edi) addl $4, 72(%esp) addl $4, %edi decl 44(%esp) jnz Loop ########################################### ### Copy result in z movl 68(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax # carry addl $48, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/Makefile.dev0000644023561000001540000000160412106741272013200 00000000000000.PHONY: all all: test_mulredc bench CFLAGS:=-g -O2 -funroll-loops ALLMULRED:= mulredc1.o mulredc2.o mulredc3.o mulredc4.o mulredc5.o\ mulredc6.o mulredc7.o mulredc8.o mulredc9.o mulredc10.o\ mulredc11.o mulredc12.o mulredc13.o mulredc14.o\ mulredc15.o mulredc16.o mulredc17.o mulredc18.o\ mulredc19.o mulredc20.o redc.s: redc.asm m4 redc.asm > redc.s redc.o: redc.s gcc -c $(CFLAGS) redc.s -o redc.o mulredc%.o: mulredc%.asm m4 $< > tmp-mulred.s gcc -c $(CFLAGS) 
tmp-mulred.s -o $@ rm tmp-mulred.s mulredc%.asm: ./autogen.py ./autogen.py $* > $@ test_mulredc: test_mulredc.c redc.o $(ALLMULRED) gcc -o test_mulredc $(CFLAGS) test_mulredc.c $(ALLMULRED) redc.o -lgmp bench: bench.c redc.o $(ALLMULRED) gcc -o bench $(CFLAGS) bench.c $(ALLMULRED) redc.o -lgmp clean: rm redc.s *.o mulredc[0-9]*.s mulredc[0-9]*.asm test_mulredc ecm-6.4.4/pentium4/generate_all0000755023561000001540000000016312106741272013332 00000000000000#!/bin/sh for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do ./autogen.py $i > mulredc$i.asm done ecm-6.4.4/pentium4/mulredc6.asm0000644023561000001540000000765412106741272013221 00000000000000# mp_limb_t mulredc6(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc6 TYPE(GSYM_PREFIX`'mulredc6,`function') GSYM_PREFIX`'mulredc6: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $56, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) ########################################### movl $6, 52(%esp) .align 32 Loop: ## compute u and store in %ebp movl 80(%esp), %eax movl 84(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 92(%esp) movl %eax, %ebp movl 88(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 6 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 
(%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 24(%edi) adcl $0, 28(%edi) movl 80(%esp), %eax movl (%eax), %ebp movl 84(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 6 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 24(%edi) adcl $0, 28(%edi) addl $4, 80(%esp) addl $4, %edi decl 52(%esp) jnz Loop ########################################### ### Copy result in z movl 76(%esp), %ebx movl (%edi), %eax 
movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax # carry addl $56, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc2.asm0000644023561000001540000000505112106741272013202 00000000000000# mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc2 TYPE(GSYM_PREFIX`'mulredc2,`function') GSYM_PREFIX`'mulredc2: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $24, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) ########################################### movl $2, 20(%esp) .align 32 Loop: ## compute u and store in %ebp movl 48(%esp), %eax movl 52(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 60(%esp) movl %eax, %ebp movl 56(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 2 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 8(%edi) adcl $0, 12(%edi) movl 48(%esp), %eax movl (%eax), %ebp movl 52(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 2 ### kills %eax, %edx and mmx regs ### 
dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 8(%edi) adcl $0, 12(%edi) addl $4, 48(%esp) addl $4, %edi decl 20(%esp) jnz Loop ########################################### ### Copy result in z movl 44(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax # carry addl $24, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc20.asm0000644023561000001540000002145312106741272013266 00000000000000# mp_limb_t mulredc20(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc20 TYPE(GSYM_PREFIX`'mulredc20,`function') GSYM_PREFIX`'mulredc20: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $168, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) movl $0, 124(%edi) movl $0, 128(%edi) movl $0, 132(%edi) movl $0, 136(%edi) movl $0, 140(%edi) movl $0, 144(%edi) movl $0, 148(%edi) movl $0, 152(%edi) movl $0, 156(%edi) movl $0, 160(%edi) ########################################### movl $20, 164(%esp) .align 32 Loop: ## compute u and store in %ebp movl 192(%esp), %eax movl 196(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 204(%esp) movl %eax, %ebp movl 200(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 20 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 
pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd 68(%esi), %mm1 movd 68(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 68(%edi) psrlq $32, %mm0 movd 72(%esi), %mm1 movd 72(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 72(%edi) psrlq $32, %mm0 movd 76(%esi), %mm1 movd 76(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 
76(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 80(%edi) adcl $0, 84(%edi) movl 192(%esp), %eax movl (%eax), %ebp movl 196(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 20 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd 
%mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd 68(%esi), %mm1 movd 68(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 68(%edi) psrlq $32, %mm0 movd 72(%esi), %mm1 movd 72(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 72(%edi) psrlq $32, %mm0 movd 76(%esi), %mm1 movd 76(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 76(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 80(%edi) adcl $0, 84(%edi) addl $4, 192(%esp) addl $4, %edi decl 164(%esp) jnz Loop ########################################### ### Copy result in z movl 188(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax movl %eax, 60(%ebx) movl 64(%edi), %eax movl %eax, 64(%ebx) movl 68(%edi), %eax movl %eax, 68(%ebx) movl 72(%edi), %eax movl %eax, 72(%ebx) movl 76(%edi), %eax movl %eax, 76(%ebx) movl 80(%edi), %eax # carry addl $168, %esp popl 
%ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc17.asm0000644023561000001540000001737712106741272013306 00000000000000# mp_limb_t mulredc17(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc17 TYPE(GSYM_PREFIX`'mulredc17,`function') GSYM_PREFIX`'mulredc17: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $144, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) movl $0, 124(%edi) movl $0, 128(%edi) movl $0, 132(%edi) movl $0, 136(%edi) ########################################### movl $17, 140(%esp) .align 32 Loop: ## compute u and store in %ebp movl 168(%esp), %eax movl 172(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 180(%esp) movl %eax, %ebp movl 176(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 17 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, 
%mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry 
limb is in %ecx addl %ecx, 68(%edi) adcl $0, 72(%edi) movl 168(%esp), %eax movl (%eax), %ebp movl 172(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 17 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 
movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 68(%edi) adcl $0, 72(%edi) addl $4, 168(%esp) addl $4, %edi decl 140(%esp) jnz Loop ########################################### ### Copy result in z movl 164(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax movl %eax, 60(%ebx) movl 64(%edi), %eax movl %eax, 64(%ebx) movl 68(%edi), %eax # carry addl $144, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/redc.asm0000644023561000001540000001600512106741272012403 00000000000000dnl Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc. dnl dnl This file is a modified part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 2.1 of the dnl License, or (at your option) any later version. 
dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with the GNU MP Library; see the file COPYING.LIB. If dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - dnl Suite 330, Boston, MA 02111-1307, USA. include(`config.m4') TEXT GLOBL GSYM_PREFIX`'ecm_redc3 TYPE(GSYM_PREFIX`'ecm_redc3,`function') GSYM_PREFIX`'ecm_redc3: push %ebp # Push registers push %edi push %esi push %ebx subl $16, %esp # SF: 2 Cpt + Jump +1 movl 44(%esp), %ecx # Read size movl 36(%esp), %edi # Read Dest Ptr movl %ecx, (%esp) # Save counter cmpl $5, %ecx jae Unroll Loop: movl 48(%esp), %ebp # Read invm movl 40(%esp), %esi # Read Source Ptr imull (%edi), %ebp # Dest[0] * invm movl %edi, 36(%esp) # Save new Dest movl 44(%esp), %ecx # Read Size (2) xorl %ebx, %ebx # Initial Carry InnerLoop: # esi: Source # edi: Dest # ebp: Multiplier # ecx: Counter movl (%esi), %eax # U1 addl $4, %edi # V1 mull %ebp # U2 addl $4, %esi # V2 addl %ebx, %eax # U3 adcl $0, %edx # U4 addl %eax, -4(%edi) # V4 adcl $0, %edx # U5 decl %ecx # V5 movl %edx, %ebx # U6 jnz InnerLoop # V6 movl 36(%esp), %edi movl %ebx, (%edi) # Save final carry decl (%esp) lea 4(%edi), %edi # Advance Dest jnz Loop # Loop End: addl $16, %esp pop %ebx pop %esi pop %edi pop %ebp ret Unroll: # %ecx Read size // %edi Dest Ptr # Precalcul du saut movl %ecx, %edx decl %ecx subl $2, %edx negl %ecx shrl $4, %edx andl $15, %ecx movl %edx, 8(%esp) # Org Cpt of 4(%esp) movl %ecx, %edx shll $4, %edx negl %ecx leal UnrollEntry (%edx, %ecx,1), %edx movl %ecx, 44(%esp) # (-size)%16 movl %edx, 12(%esp) # Org PC inside UnrollLoop: movl 48(%esp), %ebp # Read invm movl 40(%esp), %esi # Read Source Ptr imull (%edi), %ebp # Dest[0] * invm movl 
%edi, 36(%esp) # Save new Dest movl 44(%esp), %ecx # Read Size %16 movl 8(%esp), %edx # Read InnerLoop Cpt movl %edx, 4(%esp) # Set InnerLoop Cpt # First mull and set initial carry movl (%esi), %eax leal 4(%esi,%ecx,4), %esi mull %ebp leal (%edi,%ecx,4), %edi movl %edx, %ebx # Do the Jump inside the unrolling loop # And set up the registers differently if odd movl 12(%esp), %edx testl $1, %ecx movl %eax, %ecx cmovnz %ebx, %ecx cmovnz %eax, %ebx jmp *%edx # eax scratch # ebx carry hi # ecx carry lo # edx scratch # esi src # edi dst # ebp multiplier .align 32, 0x90 UnrollInnerLoop: addl $64, %edi UnrollEntry: # movl 0(%esi), %eax # Can't use this instruction .byte 0x8b,0x46,0x00 mull %ebp # addl %ecx, 0(%edi) # Can't use this instruction .byte 0x01,0x4f,0x00 adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 4(%esi), %eax mull %ebp addl %ebx, 4(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 8(%esi), %eax mull %ebp addl %ecx, 8(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 12(%esi), %eax mull %ebp addl %ebx, 12(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 16(%esi), %eax mull %ebp addl %ecx, 16(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 20(%esi), %eax mull %ebp addl %ebx, 20(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 24(%esi), %eax mull %ebp addl %ecx, 24(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 28(%esi), %eax mull %ebp addl %ebx, 28(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 32(%esi), %eax mull %ebp addl %ecx, 32(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 36(%esi), %eax mull %ebp addl %ebx, 36(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 40(%esi), %eax mull %ebp addl %ecx, 40(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 44(%esi), %eax mull %ebp addl %ebx, 44(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 48(%esi), %eax mull %ebp addl %ecx, 48(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 52(%esi), %eax mull %ebp addl 
%ebx, 52(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx movl 56(%esi), %eax mull %ebp addl %ecx, 56(%edi) adcl %eax, %ebx movl %edx, %ecx adcl $0, %ecx movl 60(%esi), %eax mull %ebp addl %ebx, 60(%edi) adcl %eax, %ecx movl %edx, %ebx adcl $0, %ebx decl 4(%esp) leal 64(%esi), %esi jns UnrollInnerLoop addl %ecx, 64(%edi) movl 36(%esp), %edi adcl $0, %ebx movl %ebx, (%edi) # Save final carry decl (%esp) lea 4(%edi), %edi # Advance Dest jnz UnrollLoop # Loop End2: addl $16, %esp pop %ebx pop %esi pop %edi pop %ebp ret ecm-6.4.4/pentium4/mulredc18.asm0000644023561000001540000002014312106741272013270 00000000000000# mp_limb_t mulredc18(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc18 TYPE(GSYM_PREFIX`'mulredc18,`function') GSYM_PREFIX`'mulredc18: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $152, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) movl $0, 124(%edi) movl $0, 128(%edi) movl $0, 132(%edi) movl $0, 136(%edi) movl $0, 140(%edi) movl $0, 144(%edi) ########################################### movl $18, 148(%esp) .align 32 Loop: ## compute u and store in %ebp movl 176(%esp), %eax movl 
180(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 188(%esp) movl %eax, %ebp movl 184(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 18 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 
movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd 68(%esi), %mm1 movd 68(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 68(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 72(%edi) adcl $0, 76(%edi) movl 176(%esp), %eax movl (%eax), %ebp movl 180(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 18 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), 
%mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd 68(%esi), %mm1 movd 68(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 68(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 72(%edi) adcl $0, 76(%edi) addl $4, 176(%esp) addl $4, %edi decl 148(%esp) jnz Loop ########################################### ### Copy result in z movl 172(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl 
%eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax movl %eax, 60(%ebx) movl 64(%edi), %eax movl %eax, 64(%ebx) movl 68(%edi), %eax movl %eax, 68(%ebx) movl 72(%edi), %eax # carry addl $152, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc19.asm0000644023561000001540000002070712106741272013277 00000000000000# mp_limb_t mulredc19(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc19 TYPE(GSYM_PREFIX`'mulredc19,`function') GSYM_PREFIX`'mulredc19: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $160, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) movl $0, 76(%edi) movl $0, 80(%edi) movl $0, 84(%edi) movl $0, 88(%edi) movl $0, 92(%edi) movl $0, 96(%edi) movl $0, 100(%edi) movl $0, 104(%edi) movl $0, 108(%edi) movl $0, 112(%edi) movl $0, 116(%edi) movl $0, 120(%edi) movl $0, 124(%edi) movl $0, 128(%edi) movl $0, 132(%edi) movl $0, 136(%edi) movl $0, 140(%edi) movl $0, 144(%edi) movl $0, 148(%edi) movl $0, 152(%edi) ########################################### movl $19, 156(%esp) .align 32 Loop: ## compute u and store in %ebp movl 184(%esp), %eax movl 188(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 196(%esp) movl %eax, %ebp movl 192(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is 
%ebp ### k is 19 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, 
%mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd 68(%esi), %mm1 movd 68(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 68(%edi) psrlq $32, %mm0 movd 72(%esi), %mm1 movd 72(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 72(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 76(%edi) adcl $0, 80(%edi) movl 184(%esp), %eax movl (%eax), %ebp movl 188(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 19 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq 
%mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd 36(%esi), %mm1 movd 36(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 36(%edi) psrlq $32, %mm0 movd 40(%esi), %mm1 movd 40(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 40(%edi) psrlq $32, %mm0 movd 44(%esi), %mm1 movd 44(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 44(%edi) psrlq $32, %mm0 movd 48(%esi), %mm1 movd 48(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 48(%edi) psrlq $32, %mm0 movd 52(%esi), %mm1 movd 52(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 52(%edi) psrlq $32, %mm0 movd 56(%esi), %mm1 movd 56(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 56(%edi) psrlq $32, %mm0 movd 60(%esi), %mm1 movd 60(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 60(%edi) psrlq $32, %mm0 movd 64(%esi), %mm1 movd 64(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 64(%edi) psrlq $32, %mm0 movd 68(%esi), %mm1 movd 68(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 68(%edi) psrlq $32, %mm0 movd 72(%esi), %mm1 movd 72(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 72(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 76(%edi) adcl $0, 80(%edi) addl $4, 184(%esp) addl $4, %edi decl 156(%esp) jnz Loop ########################################### ### Copy result in z movl 180(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax movl %eax, 36(%ebx) movl 40(%edi), %eax movl %eax, 
40(%ebx) movl 44(%edi), %eax movl %eax, 44(%ebx) movl 48(%edi), %eax movl %eax, 48(%ebx) movl 52(%edi), %eax movl %eax, 52(%ebx) movl 56(%edi), %eax movl %eax, 56(%ebx) movl 60(%edi), %eax movl %eax, 60(%ebx) movl 64(%edi), %eax movl %eax, 64(%ebx) movl 68(%edi), %eax movl %eax, 68(%ebx) movl 72(%edi), %eax movl %eax, 72(%ebx) movl 76(%edi), %eax # carry addl $160, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc9.asm0000644023561000001540000001173212106741272013214 00000000000000# mp_limb_t mulredc9(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... ## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc9 TYPE(GSYM_PREFIX`'mulredc9,`function') GSYM_PREFIX`'mulredc9: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $80, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) movl $0, 68(%edi) movl $0, 72(%edi) ########################################### movl $9, 76(%esp) .align 32 Loop: ## compute u and store in %ebp movl 104(%esp), %eax movl 108(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 116(%esp) movl %eax, %ebp movl 112(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 9 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), 
%mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 36(%edi) adcl $0, 40(%edi) movl 104(%esp), %eax movl (%eax), %ebp movl 108(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 9 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), 
%mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd 32(%esi), %mm1 movd 32(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 32(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 36(%edi) adcl $0, 40(%edi) addl $4, 104(%esp) addl $4, %edi decl 76(%esp) jnz Loop ########################################### ### Copy result in z movl 100(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax movl %eax, 32(%ebx) movl 36(%edi), %eax # carry addl $80, %esp popl %ebx popl %esi popl %edi popl %ebp emms ret ecm-6.4.4/pentium4/mulredc13.asm0000644023561000001540000001455712106741272013277 00000000000000# mp_limb_t mulredc13(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t)

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc13
	TYPE(GSYM_PREFIX`'mulredc13,`function')

# Outline (k = 13 limbs of 32 bits).  tmp is a zeroed buffer of 2k+1 limbs at
# the bottom of the frame and %edi points at tmp[i] during pass i.  Each pass:
#   u = (x[i]*y[0] + tmp[i]) * inv_m, keeping the low limb only;
#   tmp[i .. i+k+1] += u * m;
#   tmp[i .. i+k+1] += x[i] * y.
# After k passes tmp[k .. 2k-1] is copied to z and tmp[2k] is left in %eax.
# Stack offsets used below: 132 z, 136 x, 140 y, 144 m, 148 inv_m (names
# presumably as in the prototype comments of the sibling mulredcN routines --
# TODO confirm against the stack map of this routine).
GSYM_PREFIX`'mulredc13:
	# Prologue: save four registers and reserve 112 = 4*(2k+2) bytes,
	# i.e. 2k+1 tmp limbs plus the pass counter at 108(%esp).
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $112, %esp
	movl %esp, %edi
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
###########################################
	# pass counter = k
	movl $13, 108(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# %eax = x[i] * y[0] (low limb), plus tmp[i], times the limb at 148(%esp)
	movl 136(%esp), %eax
	movl 140(%esp), %esi
	movl (%eax), %eax
	mull (%esi)
	addl (%edi), %eax
	mull 148(%esp)
	movl %eax, %ebp
	# first addmul1: source is the operand at 144(%esp), multiplier is u
	movl 144(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 13
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# %mm0 holds the running 64-bit sum/carry, %mm7 the multiplier
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd 48(%esi), %mm1
	movd 48(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 48(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	# fold the carry limb into the two limbs above the k-limb window
	addl %ecx, 52(%edi)
	adcl $0, 56(%edi)
	# second addmul1: source is the operand at 140(%esp), multiplier is x[i]
	movl 136(%esp), %eax
	movl (%eax), %ebp
	movl 140(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 13
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd 48(%esi), %mm1
	movd 48(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 48(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	addl %ecx, 52(%edi)
	adcl $0, 56(%edi)

	# next pass: bump the pointer stored in the 136(%esp) argument slot by
	# one limb, slide the tmp window by one limb, count down
	addl $4, 136(%esp)
	addl $4, %edi
	decl 108(%esp)
	jnz Loop
###########################################
### Copy result in z
	# %edi now points k limbs into tmp; copy k limbs to the z pointer
	movl 132(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax	# carry
	# Epilogue: drop the frame, restore the saved registers, and leave MMX
	# state with emms because the addmul1 code used the %mm registers.
	addl $112, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	emms
	ret

# NOTE(review): the next line is not assembly; it looks like an archive (tar)
# member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc12.asm0000644023561000001540000001401312106741272013261 00000000000000
# mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z                                             (4*(2k+7))%esp
#    return address (1 limb, pushed by the call instruction)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...      ## tmp space   (2*k+1 mp_limb_t)

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc12
	TYPE(GSYM_PREFIX`'mulredc12,`function')

# Outline (k = 12 limbs of 32 bits).  tmp is a zeroed buffer of 2k+1 limbs at
# the bottom of the frame and %edi points at tmp[i] during pass i.  Each pass:
#   u = (x[i]*y[0] + tmp[i]) * inv_m, keeping the low limb only;
#   tmp[i .. i+k+1] += u * m;
#   tmp[i .. i+k+1] += x[i] * y.
# After k passes tmp[k .. 2k-1] is copied to z and tmp[2k] is left in %eax.
# Stack offsets once the frame is set up: 124 z, 128 x, 132 y, 136 m,
# 140 inv_m.
GSYM_PREFIX`'mulredc12:
	# Prologue: save four registers and reserve 104 = 4*(2k+2) bytes,
	# i.e. 2k+1 tmp limbs plus the pass counter at 100(%esp).
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $104, %esp
	movl %esp, %edi
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
###########################################
	# pass counter = k
	movl $12, 100(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# %eax = low limb of (x[i]*y[0] + tmp[i]) * inv_m
	movl 128(%esp), %eax
	movl 132(%esp), %esi
	movl (%eax), %eax
	mull (%esi)
	addl (%edi), %eax
	mull 140(%esp)
	movl %eax, %ebp
	# first addmul1: tmp[i..] += u * m
	movl 136(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 12
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# %mm0 holds the running 64-bit sum/carry, %mm7 the multiplier
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	# fold the carry limb into the two limbs above the k-limb window
	addl %ecx, 48(%edi)
	adcl $0, 52(%edi)
	# second addmul1: tmp[i..] += x[i] * y
	movl 128(%esp), %eax
	movl (%eax), %ebp
	movl 132(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 12
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	addl %ecx, 48(%edi)
	adcl $0, 52(%edi)

	# next pass: advance the x pointer (its argument slot is updated in
	# place), slide the tmp window by one limb, count down
	addl $4, 128(%esp)
	addl $4, %edi
	decl 100(%esp)
	jnz Loop
###########################################
### Copy result in z
	# %edi now points at tmp[k]; copy k limbs to z
	movl 124(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax	# carry
	# Epilogue: drop the frame, restore the saved registers, and leave MMX
	# state with emms because the addmul1 code used the %mm registers.
	addl $104, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	emms
	ret

# NOTE(review): the next line is not assembly; it looks like an archive (tar)
# member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc16.asm0000644023561000001540000001663312106741272013277 00000000000000
# mp_limb_t mulredc16(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z                                             (4*(2k+7))%esp
#    return address (1 limb, pushed by the call instruction)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...
## tmp space (2*k+1 mp_limb_t)

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc16
	TYPE(GSYM_PREFIX`'mulredc16,`function')

# Outline (k = 16 limbs of 32 bits).  tmp is a zeroed buffer of 2k+1 limbs at
# the bottom of the frame and %edi points at tmp[i] during pass i.  Each pass:
#   u = (x[i]*y[0] + tmp[i]) * inv_m, keeping the low limb only;
#   tmp[i .. i+k+1] += u * m;
#   tmp[i .. i+k+1] += x[i] * y.
# After k passes tmp[k .. 2k-1] is copied to z and tmp[2k] is left in %eax.
# Stack offsets once the frame is set up: 156 z, 160 x, 164 y, 168 m,
# 172 inv_m.
GSYM_PREFIX`'mulredc16:
	# Prologue: save four registers and reserve 136 = 4*(2k+2) bytes,
	# i.e. 2k+1 tmp limbs plus the pass counter at 132(%esp).
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $136, %esp
	movl %esp, %edi
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
	movl $0, 108(%edi)
	movl $0, 112(%edi)
	movl $0, 116(%edi)
	movl $0, 120(%edi)
	movl $0, 124(%edi)
	movl $0, 128(%edi)
###########################################
	# pass counter = k
	movl $16, 132(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# %eax = low limb of (x[i]*y[0] + tmp[i]) * inv_m
	movl 160(%esp), %eax
	movl 164(%esp), %esi
	movl (%eax), %eax
	mull (%esi)
	addl (%edi), %eax
	mull 172(%esp)
	movl %eax, %ebp
	# first addmul1: tmp[i..] += u * m
	movl 168(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 16
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# %mm0 holds the running 64-bit sum/carry, %mm7 the multiplier
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd 48(%esi), %mm1
	movd 48(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 48(%edi)
	psrlq $32, %mm0
	movd 52(%esi), %mm1
	movd 52(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 52(%edi)
	psrlq $32, %mm0
	movd 56(%esi), %mm1
	movd 56(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 56(%edi)
	psrlq $32, %mm0
	movd 60(%esi), %mm1
	movd 60(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 60(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	# fold the carry limb into the two limbs above the k-limb window
	addl %ecx, 64(%edi)
	adcl $0, 68(%edi)
	# second addmul1: tmp[i..] += x[i] * y
	movl 160(%esp), %eax
	movl (%eax), %ebp
	movl 164(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 16
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd 48(%esi), %mm1
	movd 48(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 48(%edi)
	psrlq $32, %mm0
	movd 52(%esi), %mm1
	movd 52(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 52(%edi)
	psrlq $32, %mm0
	movd 56(%esi), %mm1
	movd 56(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 56(%edi)
	psrlq $32, %mm0
	movd 60(%esi), %mm1
	movd 60(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 60(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	addl %ecx, 64(%edi)
	adcl $0, 68(%edi)

	# next pass: advance the x pointer (its argument slot is updated in
	# place), slide the tmp window by one limb, count down
	addl $4, 160(%esp)
	addl $4, %edi
	decl 132(%esp)
	jnz Loop
###########################################
### Copy result in z
	# %edi now points at tmp[k]; copy k limbs to z
	movl 156(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax
	movl %eax, 52(%ebx)
	movl 56(%edi), %eax
	movl %eax, 56(%ebx)
	movl 60(%edi), %eax
	movl %eax, 60(%ebx)
	movl 64(%edi), %eax	# carry
	# Epilogue: drop the frame, restore the saved registers, and leave MMX
	# state with emms because the addmul1 code used the %mm registers.
	addl $136, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	emms
	ret

# NOTE(review): the next line is not assembly; it looks like an archive (tar)
# member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc1.asm0000644023561000001540000000243612106741272013205 00000000000000
#
#  mp_limb_t mulredc1(mp_limb_t *z, const mp_limb_t x, const mp_limb_t y,
#                 const mp_limb_t m, mp_limb_t inv_m)
#
#  Compute z := x*y mod m, in Montgomery representation, where x, y < m
#  and m is n limb wide.  inv_m is the least significant limb of the
#  inverse of m modulo 2^(n*GMP_LIMB_BITS)
#
#  The result might be unreduced (larger than m) but becomes reduced
#  after subtracting m. The calling function should take care of that.
#
#  We use a temporary space for unreduced product on the stack.
#  Therefore, this can not be used for large integers (anyway, the
#  algorithm is quadratic).
#
#  WARNING: z is only n limbs but since it might be unreduced, there
#  could be a carry that does not fit in z. This carry is returned.
include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc1
	TYPE(GSYM_PREFIX`'mulredc1,`function')

# Single-limb case: x, y, m and inv_m are passed by value, z is a pointer.
# No frame is set up; the x and y argument slots are reused as scratch.
GSYM_PREFIX`'mulredc1:
# Stack:
#    inv_m  20(%esp)
#    m      16
#    y      12(%esp)
#    x      8
#    z      4(%esp)

	# %edx:%eax = x*y
	movl 12(%esp), %eax
	mull 8(%esp)
	movl %edx, 12(%esp)
	movl %eax, 8(%esp)	# store xy in [8(%esp):12(%esp)]
	mull 20(%esp)		# compute u
	mull 16(%esp)		# compute u*m
	addl 8(%esp), %eax	# eax is 0, now (carry is important)
	# %edx = high limb of x*y + u*m, including the carry out of the low limb
	adcl 12(%esp), %edx
	movl 4(%esp), %ecx
	movl %edx, (%ecx)
	# movl leaves the flags alone, so this adds the carry out of the
	# high-limb addition to the zero in %eax; %eax is the returned carry
	adcl $0, %eax
	ret

# NOTE(review): the next line is not assembly; it looks like an archive (tar)
# member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc15.asm0000644023561000001540000001606712106741272013277 00000000000000
# mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z                                             (4*(2k+7))%esp
#    return address (1 limb, pushed by the call instruction)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...      ## tmp space   (2*k+1 mp_limb_t)

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc15
	TYPE(GSYM_PREFIX`'mulredc15,`function')

# Outline (k = 15 limbs of 32 bits).  tmp is a zeroed buffer of 2k+1 limbs at
# the bottom of the frame and %edi points at tmp[i] during pass i.  Each pass:
#   u = (x[i]*y[0] + tmp[i]) * inv_m, keeping the low limb only;
#   tmp[i .. i+k+1] += u * m;
#   tmp[i .. i+k+1] += x[i] * y.
# After k passes tmp[k .. 2k-1] is copied to z and tmp[2k] is left in %eax.
# Stack offsets once the frame is set up: 148 z, 152 x, 156 y, 160 m,
# 164 inv_m.
GSYM_PREFIX`'mulredc15:
	# Prologue: save four registers and reserve 128 = 4*(2k+2) bytes,
	# i.e. 2k+1 tmp limbs plus the pass counter at 124(%esp).
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $128, %esp
	movl %esp, %edi
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
	movl $0, 84(%edi)
	movl $0, 88(%edi)
	movl $0, 92(%edi)
	movl $0, 96(%edi)
	movl $0, 100(%edi)
	movl $0, 104(%edi)
	movl $0, 108(%edi)
	movl $0, 112(%edi)
	movl $0, 116(%edi)
	movl $0, 120(%edi)
###########################################
	# pass counter = k
	movl $15, 124(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# %eax = low limb of (x[i]*y[0] + tmp[i]) * inv_m
	movl 152(%esp), %eax
	movl 156(%esp), %esi
	movl (%eax), %eax
	mull (%esi)
	addl (%edi), %eax
	mull 164(%esp)
	movl %eax, %ebp
	# first addmul1: tmp[i..] += u * m
	movl 160(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 15
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# %mm0 holds the running 64-bit sum/carry, %mm7 the multiplier
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd 48(%esi), %mm1
	movd 48(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 48(%edi)
	psrlq $32, %mm0
	movd 52(%esi), %mm1
	movd 52(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 52(%edi)
	psrlq $32, %mm0
	movd 56(%esi), %mm1
	movd 56(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 56(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	# fold the carry limb into the two limbs above the k-limb window
	addl %ecx, 60(%edi)
	adcl $0, 64(%edi)
	# second addmul1: tmp[i..] += x[i] * y
	movl 152(%esp), %eax
	movl (%eax), %ebp
	movl 156(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 15
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd 40(%esi), %mm1
	movd 40(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 40(%edi)
	psrlq $32, %mm0
	movd 44(%esi), %mm1
	movd 44(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 44(%edi)
	psrlq $32, %mm0
	movd 48(%esi), %mm1
	movd 48(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 48(%edi)
	psrlq $32, %mm0
	movd 52(%esi), %mm1
	movd 52(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 52(%edi)
	psrlq $32, %mm0
	movd 56(%esi), %mm1
	movd 56(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 56(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	addl %ecx, 60(%edi)
	adcl $0, 64(%edi)

	# next pass: advance the x pointer (its argument slot is updated in
	# place), slide the tmp window by one limb, count down
	addl $4, 152(%esp)
	addl $4, %edi
	decl 124(%esp)
	jnz Loop
###########################################
### Copy result in z
	# %edi now points at tmp[k]; copy k limbs to z
	movl 148(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax
	movl %eax, 40(%ebx)
	movl 44(%edi), %eax
	movl %eax, 44(%ebx)
	movl 48(%edi), %eax
	movl %eax, 48(%ebx)
	movl 52(%edi), %eax
	movl %eax, 52(%ebx)
	movl 56(%edi), %eax
	movl %eax, 56(%ebx)
	movl 60(%edi), %eax	# carry
	# Epilogue: drop the frame, restore the saved registers, and leave MMX
	# state with emms because the addmul1 code used the %mm registers.
	addl $128, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	emms
	ret

# NOTE(review): the next line is not assembly; it looks like an archive (tar)
# member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc10.asm0000644023561000001540000001250312106741272013261 00000000000000
# mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z                                             (4*(2k+7))%esp
#    return address (1 limb, pushed by the call instruction)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...
## tmp space (2*k+1 mp_limb_t)

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc10
	TYPE(GSYM_PREFIX`'mulredc10,`function')

# Outline (k = 10 limbs of 32 bits).  tmp is a zeroed buffer of 2k+1 limbs at
# the bottom of the frame and %edi points at tmp[i] during pass i.  Each pass:
#   u = (x[i]*y[0] + tmp[i]) * inv_m, keeping the low limb only;
#   tmp[i .. i+k+1] += u * m;
#   tmp[i .. i+k+1] += x[i] * y.
# After k passes tmp[k .. 2k-1] is copied to z and tmp[2k] is left in %eax.
# Stack offsets once the frame is set up: 108 z, 112 x, 116 y, 120 m,
# 124 inv_m.
GSYM_PREFIX`'mulredc10:
	# Prologue: save four registers and reserve 88 = 4*(2k+2) bytes,
	# i.e. 2k+1 tmp limbs plus the pass counter at 84(%esp).
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $88, %esp
	movl %esp, %edi
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
	movl $0, 36(%edi)
	movl $0, 40(%edi)
	movl $0, 44(%edi)
	movl $0, 48(%edi)
	movl $0, 52(%edi)
	movl $0, 56(%edi)
	movl $0, 60(%edi)
	movl $0, 64(%edi)
	movl $0, 68(%edi)
	movl $0, 72(%edi)
	movl $0, 76(%edi)
	movl $0, 80(%edi)
###########################################
	# pass counter = k
	movl $10, 84(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# %eax = low limb of (x[i]*y[0] + tmp[i]) * inv_m
	movl 112(%esp), %eax
	movl 116(%esp), %esi
	movl (%eax), %eax
	mull (%esi)
	addl (%edi), %eax
	mull 124(%esp)
	movl %eax, %ebp
	# first addmul1: tmp[i..] += u * m
	movl 120(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 10
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# %mm0 holds the running 64-bit sum/carry, %mm7 the multiplier
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	# fold the carry limb into the two limbs above the k-limb window
	addl %ecx, 40(%edi)
	adcl $0, 44(%edi)
	# second addmul1: tmp[i..] += x[i] * y
	movl 112(%esp), %eax
	movl (%eax), %ebp
	movl 116(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 10
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd 16(%esi), %mm1
	movd 16(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 16(%edi)
	psrlq $32, %mm0
	movd 20(%esi), %mm1
	movd 20(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 20(%edi)
	psrlq $32, %mm0
	movd 24(%esi), %mm1
	movd 24(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 24(%edi)
	psrlq $32, %mm0
	movd 28(%esi), %mm1
	movd 28(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 28(%edi)
	psrlq $32, %mm0
	movd 32(%esi), %mm1
	movd 32(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 32(%edi)
	psrlq $32, %mm0
	movd 36(%esi), %mm1
	movd 36(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 36(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	addl %ecx, 40(%edi)
	adcl $0, 44(%edi)

	# next pass: advance the x pointer (its argument slot is updated in
	# place), slide the tmp window by one limb, count down
	addl $4, 112(%esp)
	addl $4, %edi
	decl 84(%esp)
	jnz Loop
###########################################
### Copy result in z
	# %edi now points at tmp[k]; copy k limbs to z
	movl 108(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax
	movl %eax, 16(%ebx)
	movl 20(%edi), %eax
	movl %eax, 20(%ebx)
	movl 24(%edi), %eax
	movl %eax, 24(%ebx)
	movl 28(%edi), %eax
	movl %eax, 28(%ebx)
	movl 32(%edi), %eax
	movl %eax, 32(%ebx)
	movl 36(%edi), %eax
	movl %eax, 36(%ebx)
	movl 40(%edi), %eax	# carry
	# Epilogue: drop the frame, restore the saved registers, and leave MMX
	# state with emms because the addmul1 code used the %mm registers.
	addl $88, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	emms
	ret

# NOTE(review): the next line is not assembly or make syntax; it looks like an
# archive (tar) member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/Makefile.am0000644023561000001540000000224712106741272013023 00000000000000
# One assembly source per operand size, mulredc1 through mulredc20.
MULREDC = mulredc1.asm mulredc2.asm mulredc3.asm mulredc4.asm mulredc5.asm \
          mulredc6.asm mulredc7.asm mulredc8.asm mulredc9.asm mulredc10.asm \
          mulredc11.asm mulredc12.asm mulredc13.asm mulredc14.asm \
          mulredc15.asm mulredc16.asm mulredc17.asm mulredc18.asm \
          mulredc19.asm mulredc20.asm

EXTRA_DIST = Makefile.dev README autogen.py generate_all

noinst_LTLIBRARIES = libmulredc.la
noinst_HEADERS = mulredc.h

# This library definition also causes the mulredc[n].asm and redc.asm files
# to go in the distribution - no need for having them in EXTRA_DIST
libmulredc_la_SOURCES = $(MULREDC) redc.asm

# It's actually the .s files that depend on config.m4, but automake
# knows them only as intermediate files, not as targets. Adding the
# dependency to libmulredc.la should work so long as no stale .s
# files exist.
libmulredc_la_DEPENDENCIES = $(top_builddir)/config.m4

# The asm code does not depend on any libraries except libc for abort()
# if assertions are enabled
LIBS =
LDFLAGS =

# Suffix rules: run each .asm file through m4, with ../ on the m4 search
# path and OPERATION_<stem> defined, writing <stem>.s or <stem>.S.  The
# backquoted test prefixes $(srcdir)/ when the source file is not found in
# the current directory.
.asm.s:
	$(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.s
.asm.S:
	$(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.S

# NOTE(review): the next line is not make syntax or assembly; it looks like an
# archive (tar) member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc4.asm0000644023561000001540000000635012106741272013207 00000000000000
# mp_limb_t mulredc4(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z                                             (4*(2k+7))%esp
#    return address (1 limb, pushed by the call instruction)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...      ## tmp space   (2*k+1 mp_limb_t)

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc4
	TYPE(GSYM_PREFIX`'mulredc4,`function')

# Outline (k = 4 limbs of 32 bits).  tmp is a zeroed buffer of 2k+1 limbs at
# the bottom of the frame and %edi points at tmp[i] during pass i.  Each pass:
#   u = (x[i]*y[0] + tmp[i]) * inv_m, keeping the low limb only;
#   tmp[i .. i+k+1] += u * m;
#   tmp[i .. i+k+1] += x[i] * y.
# After k passes tmp[k .. 2k-1] is copied to z and tmp[2k] is left in %eax.
# Stack offsets once the frame is set up: 60 z, 64 x, 68 y, 72 m, 76 inv_m.
GSYM_PREFIX`'mulredc4:
	# Prologue: save four registers and reserve 40 = 4*(2k+2) bytes,
	# i.e. 2k+1 tmp limbs plus the pass counter at 36(%esp).
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	subl $40, %esp
	movl %esp, %edi
### set tmp[0..2k+1[ to 0
	movl $0, (%edi)
	movl $0, 4(%edi)
	movl $0, 8(%edi)
	movl $0, 12(%edi)
	movl $0, 16(%edi)
	movl $0, 20(%edi)
	movl $0, 24(%edi)
	movl $0, 28(%edi)
	movl $0, 32(%edi)
###########################################
	# pass counter = k
	movl $4, 36(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# %eax = low limb of (x[i]*y[0] + tmp[i]) * inv_m
	movl 64(%esp), %eax
	movl 68(%esp), %esi
	movl (%eax), %eax
	mull (%esi)
	addl (%edi), %eax
	mull 76(%esp)
	movl %eax, %ebp
	# first addmul1: tmp[i..] += u * m
	movl 72(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 4
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# %mm0 holds the running 64-bit sum/carry, %mm7 the multiplier
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	# fold the carry limb into the two limbs above the k-limb window
	addl %ecx, 16(%edi)
	adcl $0, 20(%edi)
	# second addmul1: tmp[i..] += x[i] * y
	movl 64(%esp), %eax
	movl (%eax), %ebp
	movl 68(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 4
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor %mm0, %mm0
	movd %ebp, %mm7
	movd (%esi), %mm1
	movd (%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, (%edi)
	psrlq $32, %mm0
	movd 4(%esi), %mm1
	movd 4(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 4(%edi)
	psrlq $32, %mm0
	movd 8(%esi), %mm1
	movd 8(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 8(%edi)
	psrlq $32, %mm0
	movd 12(%esi), %mm1
	movd 12(%edi), %mm2
	pmuludq %mm7, %mm1
	paddq %mm1, %mm2
	paddq %mm2, %mm0
	movd %mm0, 12(%edi)
	psrlq $32, %mm0
	movd %mm0, %ecx
### carry limb is in %ecx
	addl %ecx, 16(%edi)
	adcl $0, 20(%edi)

	# next pass: advance the x pointer (its argument slot is updated in
	# place), slide the tmp window by one limb, count down
	addl $4, 64(%esp)
	addl $4, %edi
	decl 36(%esp)
	jnz Loop
###########################################
### Copy result in z
	# %edi now points at tmp[k]; copy k limbs to z
	movl 60(%esp), %ebx
	movl (%edi), %eax
	movl %eax, (%ebx)
	movl 4(%edi), %eax
	movl %eax, 4(%ebx)
	movl 8(%edi), %eax
	movl %eax, 8(%ebx)
	movl 12(%edi), %eax
	movl %eax, 12(%ebx)
	movl 16(%edi), %eax	# carry
	# Epilogue: drop the frame, restore the saved registers, and leave MMX
	# state with emms because the addmul1 code used the %mm registers.
	addl $40, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	emms
	ret

# NOTE(review): the next line is not assembly; it looks like an archive (tar)
# member header for the following file and is kept verbatim.
ecm-6.4.4/pentium4/mulredc8.asm0000644023561000001540000001116412106741272013212 00000000000000
# mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Stack:
#    inv_m    ## parameters
#    m
#    y
#    x
#    z                                             (4*(2k+7))%esp
#    return address (1 limb, pushed by the call instruction)
#    ebp      ## pushed registers                  (4*(2k+5))%esp
#    edi
#    esi
#    ebx
#    ...      ## counter (1 mp_limb_t)             (4*(2k+1))%esp
#    ...
## tmp space (2*k+1 mp_limb_t) include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc8 TYPE(GSYM_PREFIX`'mulredc8,`function') GSYM_PREFIX`'mulredc8: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $72, %esp movl %esp, %edi ### set tmp[0..2k+1[ to 0 movl $0, (%edi) movl $0, 4(%edi) movl $0, 8(%edi) movl $0, 12(%edi) movl $0, 16(%edi) movl $0, 20(%edi) movl $0, 24(%edi) movl $0, 28(%edi) movl $0, 32(%edi) movl $0, 36(%edi) movl $0, 40(%edi) movl $0, 44(%edi) movl $0, 48(%edi) movl $0, 52(%edi) movl $0, 56(%edi) movl $0, 60(%edi) movl $0, 64(%edi) ########################################### movl $8, 68(%esp) .align 32 Loop: ## compute u and store in %ebp movl 96(%esp), %eax movl 100(%esp), %esi movl (%eax), %eax mull (%esi) addl (%edi), %eax mull 108(%esp) movl %eax, %ebp movl 104(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 8 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 
28(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 32(%edi) adcl $0, 36(%edi) movl 96(%esp), %eax movl (%eax), %ebp movl 100(%esp), %esi ### addmul1: src[0] is (%esi) ### dst[0] is (%edi) ### mult is %ebp ### k is 8 ### kills %eax, %edx and mmx regs ### dst[0,k[ += mult*src[0,k[ plus carry put in ecx pxor %mm0, %mm0 movd %ebp, %mm7 movd (%esi), %mm1 movd (%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, (%edi) psrlq $32, %mm0 movd 4(%esi), %mm1 movd 4(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 4(%edi) psrlq $32, %mm0 movd 8(%esi), %mm1 movd 8(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 8(%edi) psrlq $32, %mm0 movd 12(%esi), %mm1 movd 12(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 12(%edi) psrlq $32, %mm0 movd 16(%esi), %mm1 movd 16(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 16(%edi) psrlq $32, %mm0 movd 20(%esi), %mm1 movd 20(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 20(%edi) psrlq $32, %mm0 movd 24(%esi), %mm1 movd 24(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 24(%edi) psrlq $32, %mm0 movd 28(%esi), %mm1 movd 28(%edi), %mm2 pmuludq %mm7, %mm1 paddq %mm1, %mm2 paddq %mm2, %mm0 movd %mm0, 28(%edi) psrlq $32, %mm0 movd %mm0, %ecx ### carry limb is in %ecx addl %ecx, 32(%edi) adcl $0, 36(%edi) addl $4, 96(%esp) addl $4, %edi decl 68(%esp) jnz Loop ########################################### ### Copy result in z movl 92(%esp), %ebx movl (%edi), %eax movl %eax, (%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl 8(%edi), %eax movl %eax, 8(%ebx) movl 12(%edi), %eax movl %eax, 12(%ebx) movl 16(%edi), %eax movl %eax, 16(%ebx) movl 20(%edi), %eax movl %eax, 20(%ebx) movl 24(%edi), %eax movl %eax, 24(%ebx) movl 28(%edi), %eax movl %eax, 28(%ebx) movl 32(%edi), %eax # carry addl $72, %esp popl %ebx popl %esi popl %edi popl %ebp 
emms ret ecm-6.4.4/pentium4/README0000644023561000001540000000133412106741272011643 00000000000000mulredc[1..20].s are size-specific asm code for mulredc. These are generated by the Python script autogen.py. To avoid making the package depend on Python, this generation is not done automatically with the autoconf/automake stuff. If you need to regenerate them, the syntax is ./autogen.py 3 > mulredc3.s And you can generate all of them with the shell script ./generate_all This asm code uses MMX/SSE2 instructions and might not work on old x86 computers. If you have this problem, you should reconfigure with the --disable-asm-redc option. redc.asm is a version of redc separated from the multiplication, since there are cases where it is needed. test_mulredc.c, bench.c and the Makefile are for development. ecm-6.4.4/pentium4/mulredc11.asm0000644023561000001540000001324512106741272013266 00000000000000# mp_limb_t mulredc11(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stack: # inv_m ## parameters # m # y # x # z (4*(2k+7))%esp # ??? (1 limb???) # ebp ## pushed registers (4*(2k+5))%esp # edi # esi # ebx # ... ## counter (1 mp_limb_t) (4*(2k+1))%esp # ... 
## tmp space (2*k+1 mp_limb_t)
#
# mulredc11: fixed-size variant for k = 11 limbs of 32 bits.
#
# For each of the k limbs x[i] the loop below
#   1. computes u = low32((tmp[0] + low32(x[i]*y[0])) * inv_m),
#   2. adds u*m[0,k[ to tmp[0,k[ and propagates the carry into the next limbs,
#   3. adds x[i]*y[0,k[ to tmp[0,k[ and propagates the carry likewise,
#   4. advances the tmp window (edi) and the x pointer by one limb.
# Afterwards the k limbs at the tmp window are copied to z and the limb
# following them is returned in eax as the carry.
#
# Offsets relative to esp after the prologue (k = 11):
#   inv_m 132, m 128, y 124, x 120, z 116, return address 112,
#   saved ebp 108, edi 104, esi 100, ebx 96, loop counter 92,
#   tmp space 0 .. 88 (2k+1 = 23 limbs).

include(`config.m4')
TEXT
GLOBL GSYM_PREFIX`'mulredc11
TYPE(GSYM_PREFIX`'mulredc11,`function')

GSYM_PREFIX`'mulredc11:
	# save the registers this routine modifies
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	# reserve 2k+1 tmp limbs plus the loop counter: 4*(2k+2) = 96 bytes
	subl	$96, %esp
	movl	%esp, %edi
### set tmp[0..2k+1[ to 0
	movl	$0, (%edi)
	movl	$0, 4(%edi)
	movl	$0, 8(%edi)
	movl	$0, 12(%edi)
	movl	$0, 16(%edi)
	movl	$0, 20(%edi)
	movl	$0, 24(%edi)
	movl	$0, 28(%edi)
	movl	$0, 32(%edi)
	movl	$0, 36(%edi)
	movl	$0, 40(%edi)
	movl	$0, 44(%edi)
	movl	$0, 48(%edi)
	movl	$0, 52(%edi)
	movl	$0, 56(%edi)
	movl	$0, 60(%edi)
	movl	$0, 64(%edi)
	movl	$0, 68(%edi)
	movl	$0, 72(%edi)
	movl	$0, 76(%edi)
	movl	$0, 80(%edi)
	movl	$0, 84(%edi)
	movl	$0, 88(%edi)
###########################################
	# loop counter = k
	movl	$11, 92(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# eax = x pointer, esi = y pointer
	movl	120(%esp), %eax
	movl	124(%esp), %esi
	movl	(%eax), %eax
	# edx:eax = x[i] * y[0]
	mull	(%esi)
	# eax = low limb of the product + tmp[0]
	addl	(%edi), %eax
	# eax = low limb of (eax * inv_m)
	mull	132(%esp)
	movl	%eax, %ebp
	# esi = m pointer
	movl	128(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 11
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# mm0 is the running 64-bit accumulator, mm7 holds the multiplier
	pxor	%mm0, %mm0
	movd	%ebp, %mm7
	movd	(%esi), %mm1
	movd	(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, (%edi)
	psrlq	$32, %mm0
	movd	4(%esi), %mm1
	movd	4(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 4(%edi)
	psrlq	$32, %mm0
	movd	8(%esi), %mm1
	movd	8(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 8(%edi)
	psrlq	$32, %mm0
	movd	12(%esi), %mm1
	movd	12(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 12(%edi)
	psrlq	$32, %mm0
	movd	16(%esi), %mm1
	movd	16(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 16(%edi)
	psrlq	$32, %mm0
	movd	20(%esi), %mm1
	movd	20(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 20(%edi)
	psrlq	$32, %mm0
	movd	24(%esi), %mm1
	movd	24(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 24(%edi)
	psrlq	$32, %mm0
	movd	28(%esi), %mm1
	movd	28(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 28(%edi)
	psrlq	$32, %mm0
	movd	32(%esi), %mm1
	movd	32(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 32(%edi)
	psrlq	$32, %mm0
	movd	36(%esi), %mm1
	movd	36(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 36(%edi)
	psrlq	$32, %mm0
	movd	40(%esi), %mm1
	movd	40(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 40(%edi)
	psrlq	$32, %mm0
	movd	%mm0, %ecx
### carry limb is in %ecx
	# add the carry limb to tmp[k] and ripple into tmp[k+1]
	addl	%ecx, 44(%edi)
	adcl	$0, 48(%edi)
	# ebp = x[i], esi = y pointer
	movl	120(%esp), %eax
	movl	(%eax), %ebp
	movl	124(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 11
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor	%mm0, %mm0
	movd	%ebp, %mm7
	movd	(%esi), %mm1
	movd	(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, (%edi)
	psrlq	$32, %mm0
	movd	4(%esi), %mm1
	movd	4(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 4(%edi)
	psrlq	$32, %mm0
	movd	8(%esi), %mm1
	movd	8(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 8(%edi)
	psrlq	$32, %mm0
	movd	12(%esi), %mm1
	movd	12(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 12(%edi)
	psrlq	$32, %mm0
	movd	16(%esi), %mm1
	movd	16(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 16(%edi)
	psrlq	$32, %mm0
	movd	20(%esi), %mm1
	movd	20(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 20(%edi)
	psrlq	$32, %mm0
	movd	24(%esi), %mm1
	movd	24(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 24(%edi)
	psrlq	$32, %mm0
	movd	28(%esi), %mm1
	movd	28(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 28(%edi)
	psrlq	$32, %mm0
	movd	32(%esi), %mm1
	movd	32(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 32(%edi)
	psrlq	$32, %mm0
	movd	36(%esi), %mm1
	movd	36(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 36(%edi)
	psrlq	$32, %mm0
	movd	40(%esi), %mm1
	movd	40(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 40(%edi)
	psrlq	$32, %mm0
	movd	%mm0, %ecx
### carry limb is in %ecx
	addl	%ecx, 44(%edi)
	adcl	$0, 48(%edi)

	# advance the x pointer (the stack copy of the argument) and the tmp
	# window by one limb, then loop while the counter is non-zero
	addl	$4, 120(%esp)
	addl	$4, %edi
	decl	92(%esp)
	jnz	Loop
###########################################
### Copy result in z
	# ebx = z pointer; edi now points k limbs above the start of tmp
	movl	116(%esp), %ebx
	movl	(%edi), %eax
	movl	%eax, (%ebx)
	movl	4(%edi), %eax
	movl	%eax, 4(%ebx)
	movl	8(%edi), %eax
	movl	%eax, 8(%ebx)
	movl	12(%edi), %eax
	movl	%eax, 12(%ebx)
	movl	16(%edi), %eax
	movl	%eax, 16(%ebx)
	movl	20(%edi), %eax
	movl	%eax, 20(%ebx)
	movl	24(%edi), %eax
	movl	%eax, 24(%ebx)
	movl	28(%edi), %eax
	movl	%eax, 28(%ebx)
	movl	32(%edi), %eax
	movl	%eax, 32(%ebx)
	movl	36(%edi), %eax
	movl	%eax, 36(%ebx)
	movl	40(%edi), %eax
	movl	%eax, 40(%ebx)
	movl	44(%edi), %eax	# carry
	# release the locals, restore the saved registers, leave MMX state
	addl	$96, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	emms
	ret
## tmp space (2*k+1 mp_limb_t)
#
# mulredc7: fixed-size variant for k = 7 limbs of 32 bits.
#
# For each of the k limbs x[i] the loop below
#   1. computes u = low32((tmp[0] + low32(x[i]*y[0])) * inv_m),
#   2. adds u*m[0,k[ to tmp[0,k[ and propagates the carry into the next limbs,
#   3. adds x[i]*y[0,k[ to tmp[0,k[ and propagates the carry likewise,
#   4. advances the tmp window (edi) and the x pointer by one limb.
# Afterwards the k limbs at the tmp window are copied to z and the limb
# following them is returned in eax as the carry.
#
# Offsets relative to esp after the prologue (k = 7):
#   inv_m 100, m 96, y 92, x 88, z 84, return address 80,
#   saved ebp 76, edi 72, esi 68, ebx 64, loop counter 60,
#   tmp space 0 .. 56 (2k+1 = 15 limbs).

include(`config.m4')
TEXT
GLOBL GSYM_PREFIX`'mulredc7
TYPE(GSYM_PREFIX`'mulredc7,`function')

GSYM_PREFIX`'mulredc7:
	# save the registers this routine modifies
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	# reserve 2k+1 tmp limbs plus the loop counter: 4*(2k+2) = 64 bytes
	subl	$64, %esp
	movl	%esp, %edi
### set tmp[0..2k+1[ to 0
	movl	$0, (%edi)
	movl	$0, 4(%edi)
	movl	$0, 8(%edi)
	movl	$0, 12(%edi)
	movl	$0, 16(%edi)
	movl	$0, 20(%edi)
	movl	$0, 24(%edi)
	movl	$0, 28(%edi)
	movl	$0, 32(%edi)
	movl	$0, 36(%edi)
	movl	$0, 40(%edi)
	movl	$0, 44(%edi)
	movl	$0, 48(%edi)
	movl	$0, 52(%edi)
	movl	$0, 56(%edi)
###########################################
	# loop counter = k
	movl	$7, 60(%esp)

	.align 32
Loop:
	## compute u and store in %ebp
	# eax = x pointer, esi = y pointer
	movl	88(%esp), %eax
	movl	92(%esp), %esi
	movl	(%eax), %eax
	# edx:eax = x[i] * y[0]
	mull	(%esi)
	# eax = low limb of the product + tmp[0]
	addl	(%edi), %eax
	# eax = low limb of (eax * inv_m)
	mull	100(%esp)
	movl	%eax, %ebp
	# esi = m pointer
	movl	96(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 7
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	# mm0 is the running 64-bit accumulator, mm7 holds the multiplier
	pxor	%mm0, %mm0
	movd	%ebp, %mm7
	movd	(%esi), %mm1
	movd	(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, (%edi)
	psrlq	$32, %mm0
	movd	4(%esi), %mm1
	movd	4(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 4(%edi)
	psrlq	$32, %mm0
	movd	8(%esi), %mm1
	movd	8(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 8(%edi)
	psrlq	$32, %mm0
	movd	12(%esi), %mm1
	movd	12(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 12(%edi)
	psrlq	$32, %mm0
	movd	16(%esi), %mm1
	movd	16(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 16(%edi)
	psrlq	$32, %mm0
	movd	20(%esi), %mm1
	movd	20(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 20(%edi)
	psrlq	$32, %mm0
	movd	24(%esi), %mm1
	movd	24(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 24(%edi)
	psrlq	$32, %mm0
	movd	%mm0, %ecx
### carry limb is in %ecx
	# add the carry limb to tmp[k] and ripple into tmp[k+1]
	addl	%ecx, 28(%edi)
	adcl	$0, 32(%edi)
	# ebp = x[i], esi = y pointer
	movl	88(%esp), %eax
	movl	(%eax), %ebp
	movl	92(%esp), %esi
### addmul1: src[0] is (%esi)
###          dst[0] is (%edi)
###          mult is %ebp
###          k is 7
###          kills %eax, %edx and mmx regs
###          dst[0,k[ += mult*src[0,k[  plus carry put in ecx
	pxor	%mm0, %mm0
	movd	%ebp, %mm7
	movd	(%esi), %mm1
	movd	(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, (%edi)
	psrlq	$32, %mm0
	movd	4(%esi), %mm1
	movd	4(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 4(%edi)
	psrlq	$32, %mm0
	movd	8(%esi), %mm1
	movd	8(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 8(%edi)
	psrlq	$32, %mm0
	movd	12(%esi), %mm1
	movd	12(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 12(%edi)
	psrlq	$32, %mm0
	movd	16(%esi), %mm1
	movd	16(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 16(%edi)
	psrlq	$32, %mm0
	movd	20(%esi), %mm1
	movd	20(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 20(%edi)
	psrlq	$32, %mm0
	movd	24(%esi), %mm1
	movd	24(%edi), %mm2
	pmuludq	%mm7, %mm1
	paddq	%mm1, %mm2
	paddq	%mm2, %mm0
	movd	%mm0, 24(%edi)
	psrlq	$32, %mm0
	movd	%mm0, %ecx
### carry limb is in %ecx
	addl	%ecx, 28(%edi)
	adcl	$0, 32(%edi)

	# advance the x pointer (the stack copy of the argument) and the tmp
	# window by one limb, then loop while the counter is non-zero
	addl	$4, 88(%esp)
	addl	$4, %edi
	decl	60(%esp)
	jnz	Loop
###########################################
### Copy result in z
	# ebx = z pointer; edi now points k limbs above the start of tmp
	movl	84(%esp), %ebx
	movl	(%edi), %eax
	movl	%eax, (%ebx)
	movl	4(%edi), %eax
	movl	%eax, 4(%ebx)
	movl	8(%edi), %eax
	movl	%eax, 8(%ebx)
	movl	12(%edi), %eax
	movl	%eax, 12(%ebx)
	movl	16(%edi), %eax
	movl	%eax, 16(%ebx)
	movl	20(%edi), %eax
	movl	%eax, 20(%ebx)
	movl	24(%edi), %eax
	movl	%eax, 24(%ebx)
	movl	28(%edi), %eax	# carry
	# release the locals, restore the saved registers, leave MMX state
	addl	$64, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	emms
	ret
The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include /* for stderr */ #include #include "sp.h" /* Test if m is a base "a" strong probable prime */ int sp_spp (sp_t a, sp_t m, sp_t d) { sp_t r, s, t, e; if (m == a) return 1; /* Set e * 2^s = m-1, e odd */ for (s = 0, e = m - 1; !(e & 1); s++, e >>= 1); t = sp_pow (a, e, m, d); if (t == 1) return 1; for (r = 0; r < s; r++) { if (t == m - 1) return 1; t = sp_sqr (t, m, d); } return 0; } /* Test if x is a prime, return 1 if it is. Note this only works on sp's, i.e. we need the top bit of x set */ int sp_prime (sp_t x) { sp_t d; if (!(x & 1)) return 0; if (x < SP_MIN) return 1; sp_reciprocal (d, x); if (SP_NUMB_BITS <= 32) { /* 32-bit primality test * See http://primes.utm.edu/prove/prove2_3.html */ if (!sp_spp (2, x, d) || !sp_spp (7, x, d) || !sp_spp (61, x, d)) return 0; } else { ASSERT (SP_NUMB_BITS <= 64); /* 64-bit primality test * follows from results by Jaeschke, "On strong pseudoprimes to several * bases" Math. Comp. 61 (1993) p916 */ if (!sp_spp (2, x, d) || !sp_spp (3, x, d) || !sp_spp (5, x, d) || !sp_spp (7, x, d) || !sp_spp (11, x, d) || !sp_spp (13, x, d) || !sp_spp (17, x, d) || ! 
#define CACHE_LINE_SIZE 64

/* Allocate len bytes whose address is a multiple of CACHE_LINE_SIZE.

   The block is over-allocated by CACHE_LINE_SIZE bytes and the pointer
   obtained from malloc is stored in the pointer-sized slot just below the
   address handed to the caller, where sp_aligned_free finds it again.

   Returns NULL if the allocation fails or if len is so large that adding
   the padding would wrap around.  The result must be released with
   sp_aligned_free, never with free. */
void *
sp_aligned_malloc (size_t len)
{
  void *ptr, *aligned_ptr;
  size_t addr;

  /* len + CACHE_LINE_SIZE must not wrap: a wrapped size would yield a
     tiny block that the caller then overruns */
  if (len > (size_t) -1 - CACHE_LINE_SIZE)
    return NULL;

  ptr = malloc (len + CACHE_LINE_SIZE);
  if (ptr == NULL)
    return NULL;

  /* Distance to the next multiple of CACHE_LINE_SIZE strictly above ptr,
     in the range 1..CACHE_LINE_SIZE.
     NOTE(review): the back-pointer store below needs this distance to be
     at least sizeof (void *); that holds when malloc returns storage
     aligned to at least sizeof (void *) -- confirm for exotic targets. */
  addr = (size_t) ptr;
  addr = CACHE_LINE_SIZE - (addr % CACHE_LINE_SIZE);
  aligned_ptr = (void *) ((char *) ptr + addr);

  /* remember the pointer to pass to free */
  *((void **) aligned_ptr - 1) = ptr;
  return aligned_ptr;
}

/* Release a block obtained from sp_aligned_malloc.  A NULL argument is
   ignored. */
void
sp_aligned_free (void *newptr)
{
  void *ptr;

  if (newptr == NULL)
    return;

  /* fetch the malloc pointer stored just below the aligned address */
  ptr = *((void **) newptr - 1);
  free (ptr);
}
POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 128 ecm-6.4.4/rho.c0000644023561000001540000006443212106741273010163 00000000000000/* Dickman's rho function (to compute probability of success of ecm). Copyright 2004, 2005, 2006, 2008, 2009, 2010, 2011 Alexander Kruppa, Paul Zimmermann. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #if defined(TESTDRIVE) #define _ISOC99_SOURCE 1 #endif #if defined(DEBUG_NUMINTEGRATE) || defined(TESTDRIVE) # include #endif #include #include #if defined(TESTDRIVE) #include #include "primegen.h" #endif #if defined(TESTDRIVE) && defined(HAVE_LIBGSL) #include #include #include #endif #include "ecm-impl.h" /* For Suyama's curves, we have a known torsion factor of 12 = 2^2*3^1, and an average extra exponent of 1/2 for 2, and 1/3 for 3 due to the probability that the group order divided by 12 is divisible by 2 or 3, thus on average we should have 2^2.5*3^1.333 ~ 24.5, however experimentally we have 2^3.323*3^1.687 ~ 63.9 (see Alexander Kruppa's thesis, Table 5.1 page 96, row sigma=2, http://tel.archives-ouvertes.fr/tel-00477005/en/). The exp(ECM_EXTRA_SMOOTHNESS) value takes into account the extra smoothness with respect to a random number. 
*/ #ifndef ECM_EXTRA_SMOOTHNESS #define ECM_EXTRA_SMOOTHNESS 3.134 #endif #define M_PI_SQR 9.869604401089358619 /* Pi^2 */ #define M_PI_SQR_6 1.644934066848226436 /* Pi^2/6 */ /* gsl_math.h defines M_EULER */ #ifndef M_EULER #define M_EULER 0.577215664901532861 #endif #define M_EULER_1 0.422784335098467139 /* 1 - Euler */ #ifndef MAX #define MAX(x,y) ((x) > (y) ? (x) : (y)) #endif #ifndef MIN #define MIN(x,y) ((x) < (y) ? (x) : (y)) #endif void rhoinit (int, int); /* used in stage2.c */ static double *rhotable = NULL; static int invh = 0; static double h = 0.; static int tablemax = 0; #if defined(TESTDRIVE) #define PRIME_PI_MAX 10000 #define PRIME_PI_MAP(x) (((x)+1)/2) /* The number of primes up to i. Use prime_pi[PRIME_PI_MAP(i)]. Only correct for i >= 2. */ static unsigned int prime_pi[PRIME_PI_MAP(PRIME_PI_MAX)+1]; #endif /* Fixme: need prime generating funcion without static state variables */ const unsigned char primemap[667] = { 254, 223, 239, 126, 182, 219, 61, 249, 213, 79, 30, 243, 234, 166, 237, 158, 230, 12, 211, 211, 59, 221, 89, 165, 106, 103, 146, 189, 120, 30, 166, 86, 86, 227, 173, 45, 222, 42, 76, 85, 217, 163, 240, 159, 3, 84, 161, 248, 46, 253, 68, 233, 102, 246, 19, 58, 184, 76, 43, 58, 69, 17, 191, 84, 140, 193, 122, 179, 200, 188, 140, 79, 33, 88, 113, 113, 155, 193, 23, 239, 84, 150, 26, 8, 229, 131, 140, 70, 114, 251, 174, 101, 146, 143, 88, 135, 210, 146, 216, 129, 101, 38, 227, 160, 17, 56, 199, 38, 60, 129, 235, 153, 141, 81, 136, 62, 36, 243, 51, 77, 90, 139, 28, 167, 42, 180, 88, 76, 78, 38, 246, 25, 130, 220, 131, 195, 44, 241, 56, 2, 181, 205, 205, 2, 178, 74, 148, 12, 87, 76, 122, 48, 67, 11, 241, 203, 68, 108, 36, 248, 25, 1, 149, 168, 92, 115, 234, 141, 36, 150, 43, 80, 166, 34, 30, 196, 209, 72, 6, 212, 58, 47, 116, 156, 7, 106, 5, 136, 191, 104, 21, 46, 96, 85, 227, 183, 81, 152, 8, 20, 134, 90, 170, 69, 77, 73, 112, 39, 210, 147, 213, 202, 171, 2, 131, 97, 5, 36, 206, 135, 34, 194, 169, 173, 24, 140, 77, 120, 209, 137, 22, 176, 
87, 199, 98, 162, 192, 52, 36, 82, 174, 90, 64, 50, 141, 33, 8, 67, 52, 182, 210, 182, 217, 25, 225, 96, 103, 26, 57, 96, 208, 68, 122, 148, 154, 9, 136, 131, 168, 116, 85, 16, 39, 161, 93, 104, 30, 35, 200, 50, 224, 25, 3, 68, 115, 72, 177, 56, 195, 230, 42, 87, 97, 152, 181, 28, 10, 104, 197, 129, 143, 172, 2, 41, 26, 71, 227, 148, 17, 78, 100, 46, 20, 203, 61, 220, 20, 197, 6, 16, 233, 41, 177, 130, 233, 48, 71, 227, 52, 25, 195, 37, 10, 48, 48, 180, 108, 193, 229, 70, 68, 216, 142, 76, 93, 34, 36, 112, 120, 146, 137, 129, 130, 86, 38, 27, 134, 233, 8, 165, 0, 211, 195, 41, 176, 194, 74, 16, 178, 89, 56, 161, 29, 66, 96, 199, 34, 39, 140, 200, 68, 26, 198, 139, 130, 129, 26, 70, 16, 166, 49, 9, 240, 84, 47, 24, 210, 216, 169, 21, 6, 46, 12, 246, 192, 14, 80, 145, 205, 38, 193, 24, 56, 101, 25, 195, 86, 147, 139, 42, 45, 214, 132, 74, 97, 10, 165, 44, 9, 224, 118, 196, 106, 60, 216, 8, 232, 20, 102, 27, 176, 164, 2, 99, 54, 16, 49, 7, 213, 146, 72, 66, 18, 195, 138, 160, 159, 45, 116, 164, 130, 133, 120, 92, 13, 24, 176, 97, 20, 29, 2, 232, 24, 18, 193, 1, 73, 28, 131, 48, 103, 51, 161, 136, 216, 15, 12, 244, 152, 136, 88, 215, 102, 66, 71, 177, 22, 168, 150, 8, 24, 65, 89, 21, 181, 68, 42, 82, 225, 179, 170, 161, 89, 69, 98, 85, 24, 17, 165, 12, 163, 60, 103, 0, 190, 84, 214, 10, 32, 54, 107, 130, 12, 21, 8, 126, 86, 145, 1, 120, 208, 97, 10, 132, 168, 44, 1, 87, 14, 86, 160, 80, 11, 152, 140, 71, 108, 32, 99, 16, 196, 9, 228, 12, 87, 136, 11, 117, 11, 194, 82, 130, 194, 57, 36, 2, 44, 86, 37, 122, 49, 41, 214, 163, 32, 225, 177, 24, 176, 12, 138, 50, 193, 17, 50, 9, 197, 173, 48, 55, 8, 188, 145, 130, 207, 32, 37, 107, 156, 48, 143, 68, 38, 70, 106, 7, 73, 142, 9, 88, 16, 2, 37, 197, 196, 66, 90, 128, 160, 128, 60, 144, 40, 100, 20, 225, 3, 132, 81, 12, 46, 163, 138, 164, 8, 192, 71, 126, 211, 43, 3, 205, 84, 42, 0, 4, 179, 146, 108, 66, 41, 76, 131, 193, 146, 204, 28}; #ifdef TESTDRIVE unsigned long gcd (unsigned long a, unsigned long b) { unsigned long t; 
/* Euler's totient function of n, computed by trial division.

   The trial divisors are 2 followed by the odd numbers 3, 5, 7, ...;
   each prime power p^e dividing n contributes (p-1)*p^(e-1) to the result.

   The loop bound is written p <= n / p rather than p * p <= n: both are
   equivalent for p >= 1, but the product can wrap around for n close to
   ULONG_MAX, which would keep the loop running far beyond sqrt(n). */
unsigned long
eulerphi (unsigned long n)
{
  unsigned long phi = 1, p;

  for (p = 2; p <= n / p; p += 2)
    {
      if (n % p == 0)
        {
          /* first factor p contributes p-1, every further one p */
          phi *= p - 1;
          n /= p;
          while (n % p == 0)
            {
              phi *= p;
              n /= p;
            }
        }

      /* after 2, continue with 3, 5, 7, ... (the loop adds 2 to 1) */
      if (p == 2)
        p--;
    }

  /* what is left of n is 1 or a prime */
  return (n == 1) ? phi : phi * (n - 1);
}
/* Partial sum of the dilogarithm power series
     Li_2(z) = sum_{k >= 1} z^k / k^2
   using the first 44 terms.  Intended for |z| <= 0.5, where the k+1-st
   term is at most 1/(2^k k^2) of the result, so 44 terms cover the 53-bit
   significand of a double. */
static double
dilog_series (const double z)
{
  double power = z;    /* z^k for the current k */
  double total = 0.0;
  int k = 1;

  while (k <= 44)
    {
      total += power / (double) (k * k);
      power *= z;
      k++;
    }

  return total;
}
return 1.; if (x <= 2.) return 1. - log (x); if (x <= 3.) /* 2 < x <= 3 thus -2 <= 1-x < -1 */ return 1. - log (x) * (1. - log (x - 1.)) + dilog (1. - x) + 0.5 * M_PI_SQR_6; return 0.; /* x > 3. and asserting not enabled: bail out with 0. */ } #if defined(TESTDRIVE) && defined(HAVE_LIBGSL) /* The Buchstab omega(x) function, exact for x <= 4 where it can be evaluated without numerical integration, and approximated by exp(gamma) for larger x. */ static double Buchstab_omega (const double x) { /* magic = dilog(-1) + 1 = Pi^2/12 + 1 */ const double magic = 1.82246703342411321824; if (x < 1.) return (0.); if (x <= 2.) return (1. / x); if (x <= 3.) return ((log (x - 1.) + 1.) / x); if (x <= 4.) return ((dilog(2. - x) + (1. + log(x - 2.)) * log(x - 1.) + magic) / x); /* If argument is out of range, return the limiting value for $x->\infty$: e^-gamma. For x only a little larger than 4., this has relative error 2.2e-6, for larger x the error rapidly drops further */ return 0.56145948356688516982; } #endif void rhoinit (int parm_invh, int parm_tablemax) { int i; if (parm_invh == invh && parm_tablemax == tablemax) return; if (rhotable != NULL) { free (rhotable); rhotable = NULL; invh = 0; h = 0.; tablemax = 0; } /* The integration below expects 3 * invh > 4 */ if (parm_tablemax == 0 || parm_invh < 2) return; invh = parm_invh; h = 1. / (double) invh; tablemax = parm_tablemax; rhotable = (double *) malloc (parm_invh * parm_tablemax * sizeof (double)); if (rhotable == NULL) { fprintf (stderr, "Cannot allocate memory in rhoinit\n"); exit (1); } for (i = 0; i < (3 < parm_tablemax ? 3 : parm_tablemax) * invh; i++) rhotable[i] = rhoexact (i * h); for (i = 3 * invh; i < parm_tablemax * invh; i++) { /* rho(i*h) = 1 - \int_{1}^{i*h} rho(x-1)/x dx = rho((i-4)*h) - \int_{(i-4)*h}^{i*h} rho(x-1)/x dx */ rhotable[i] = rhotable[i - 4] - 2. / 45. * ( 7. * rhotable[i - invh - 4] / (double)(i - 4) + 32. * rhotable[i - invh - 3] / (double)(i - 3) + 12. 
* rhotable[i - invh - 2] / (double)(i - 2) + 32. * rhotable[i - invh - 1] / (double)(i - 1) + 7. * rhotable[i - invh] / (double)i ); if (rhotable[i] < 0.) { #ifndef DEBUG_NUMINTEGRATE rhotable[i] = 0.; #else printf (stderr, "rhoinit: rhotable[%d] = %.16f\n", i, rhotable[i]); exit (EXIT_FAILURE); #endif } } } static double dickmanrho (double alpha) { if (alpha <= 3.) return rhoexact (alpha); if (alpha < tablemax) { int a = floor (alpha * invh); double rho1 = rhotable[a]; double rho2 = (a + 1) < tablemax * invh ? rhotable[a + 1] : 0; return rho1 + (rho2 - rho1) * (alpha * invh - (double)a); } return 0.; } #if 0 static double dickmanrhosigma (double alpha, double x) { if (alpha <= 0.) return 0.; if (alpha <= 1.) return 1.; if (alpha < tablemax) return dickmanrho (alpha) + M_EULER_1 * dickmanrho (alpha - 1.) / log (x); return 0.; } static double dickmanrhosigma_i (int ai, double x) { if (ai <= 0) return 0.; if (ai <= invh) return 1.; if (ai < tablemax * invh) return rhotable[ai] - M_EULER * rhotable[ai - invh] / log(x); return 0.; } #endif static double dickmanlocal (double alpha, double x) { if (alpha <= 0.) return 0.; if (alpha <= 1.) return 1.; if (alpha < tablemax) return dickmanrho (alpha) - M_EULER * dickmanrho (alpha - 1.) 
/ log (x); return 0.; } static double dickmanlocal_i (int ai, double x) { if (ai <= 0) return 0.; if (ai <= invh) return 1.; if (ai <= 2 * invh && ai < tablemax * invh) return rhotable[ai] - M_EULER / log (x); if (ai < tablemax * invh) { double logx = log (x); return rhotable[ai] - (M_EULER * rhotable[ai - invh] + M_EULER_1 * rhotable[ai - 2 * invh] / logx) / logx; } return 0.; } static int isprime(unsigned long n) { unsigned int r; if (n % 2 == 0) return (n == 2); if (n % 3 == 0) return (n == 3); if (n % 5 == 0) return (n == 5); if (n / 30 >= sizeof (primemap)) abort(); r = n % 30; /* 8 possible values: 1,7,11,13,17,19,23,29 */ r = (r * 16 + r) / 64; /* maps the 8 values onto 0, ..., 7 */ return ((primemap[n / 30] & (1 << r)) != 0); } static double dickmanmu_sum (const unsigned long B1, const unsigned long B2, const double x) { double s = 0.; const double logB1 = 1. / log(B1); const double logx = log(x); unsigned long p; for (p = B1 + 1; p <= B2; p++) if (isprime(p)) s += dickmanlocal ((logx - log(p)) * logB1, x / p) / p; return (s); } static double dickmanmu (double alpha, double beta, double x) { double a, b, sum; int ai, bi, i; ai = ceil ((alpha - beta) * invh); if (ai > tablemax * invh) ai = tablemax * invh; a = (double) ai * h; bi = floor ((alpha - 1.) * invh); if (bi > tablemax * invh) bi = tablemax * invh; b = (double) bi * h; sum = 0.; for (i = ai + 1; i < bi; i++) sum += dickmanlocal_i (i, x) / (alpha - i * h); sum += 0.5 * dickmanlocal_i (ai, x) / (alpha - a); sum += 0.5 * dickmanlocal_i (bi, x) / (alpha - b); sum *= h; sum += (a - alpha + beta) * 0.5 * (dickmanlocal_i (ai, x) / (alpha - a) + dickmanlocal (alpha - beta, x) / beta); sum += (alpha - 1. 
- b) * 0.5 * (dickmanlocal (alpha - 1., x) + dickmanlocal_i (bi, x) / (alpha - b)); return sum; } static double brentsuyama (double B1, double B2, double N, double nr) { double a, alpha, beta, sum; int ai, i; alpha = log (N) / log (B1); beta = log (B2) / log (B1); ai = floor ((alpha - beta) * invh); if (ai > tablemax * invh) ai = tablemax * invh; a = (double) ai * h; sum = 0.; for (i = 1; i < ai; i++) sum += dickmanlocal_i (i, N) / (alpha - i * h) * (1 - exp (-nr * pow (B1, (-alpha + i * h)))); sum += 0.5 * (1 - exp(-nr / pow (B1, alpha))); sum += 0.5 * dickmanlocal_i (ai, N) / (alpha - a) * (1 - exp(-nr * pow (B1, (-alpha + a)))); sum *= h; sum += 0.5 * (alpha - beta - a) * (dickmanlocal_i (ai, N) / (alpha - a) + dickmanlocal (alpha - beta, N) / beta); return sum; } static double brsudickson (double B1, double B2, double N, double nr, int S) { int i, f; double sum; sum = 0; f = eulerphi (S) / 2; for (i = 1; i <= S / 2; i++) if (gcd (i, S) == 1) sum += brentsuyama (B1, B2, N, nr * (gcd (i - 1, S) + gcd (i + 1, S) - 4) / 2); return sum / (double)f; } static double brsupower (double B1, double B2, double N, double nr, int S) { int i, f; double sum; sum = 0; f = eulerphi (S); for (i = 1; i < S; i++) if (gcd (i, S) == 1) sum += brentsuyama (B1, B2, N, nr * (gcd (i - 1, S) - 2)); return sum / (double)f; } /* Assume N is as likely smooth as a number around N/exp(delta) */ static double prob (double B1, double B2, double N, double nr, int S, double delta) { const double sumthresh = 20000.; double alpha, beta, stage1, stage2, brsu; const double effN = N / exp (delta); ASSERT(rhotable != NULL); /* What to do if rhotable is not initialised and asserting is not enabled? For now, bail out with 0. result. Not really pretty, either */ if (rhotable == NULL) return 0.; if (B1 < 2. || N <= 1.) 
return 0.; if (effN <= B1) return 1.; #ifdef TESTDRIVE printf ("B1 = %f, B2 = %f, N = %.0f, nr = %f, S = %d\n", B1, B2, N, nr, S); #endif alpha = log (effN) / log (B1); stage1 = dickmanlocal (alpha, effN); stage2 = 0.; if (B2 > B1) { if (B1 < sumthresh) { stage2 += dickmanmu_sum (B1, MIN(B2, sumthresh), effN); beta = log (B2) / log (MIN(B2, sumthresh)); } else beta = log (B2) / log (B1); if (beta > 1.) stage2 += dickmanmu (alpha, beta, effN); } brsu = 0.; if (S < -1) brsu = brsudickson (B1, B2, effN, nr, -S * 2); if (S > 1) brsu = brsupower (B1, B2, effN, nr, S * 2); #ifdef TESTDRIVE printf ("stage 1 : %f, stage 2 : %f, Brent-Suyama : %f\n", stage1, stage2, brsu); #endif return (stage1 + stage2 + brsu) > 0. ? (stage1 + stage2 + brsu) : 0.; } double ecmprob (double B1, double B2, double N, double nr, int S) { return prob (B1, B2, N, nr, S, ECM_EXTRA_SMOOTHNESS); } double pm1prob (double B1, double B2, double N, double nr, int S, const mpz_t go) { mpz_t cof; /* A prime power q^k divides p-1, p prime, with probability 1/(q^k-q^(k-1)) not with probability 1/q^k as for random numbers. This is taken into account by the "smoothness" value here; a prime p-1 is about as likely smooth as a random number around (p-1)/exp(smoothness). 
smoothness = \sum_{q in Primes} log(q)/(q-1)^2 */ double smoothness = 1.2269688; unsigned long i; if (go != NULL && mpz_cmp_ui (go, 1UL) > 0) { mpz_init (cof); mpz_set (cof, go); for (i = 2; i < 100; i++) if (mpz_divisible_ui_p (cof, i)) { /* If we know that q divides p-1 with probability 1, we need to adjust the smoothness parameter */ smoothness -= log ((double) i) / (double) ((i-1)*(i-1)); /* printf ("pm1prob: Dividing out %lu\n", i); */ while (mpz_divisible_ui_p (cof, i)) mpz_tdiv_q_ui (cof, cof, i); } /* printf ("pm1prob: smoothness after dividing out go primes < 100: %f\n", smoothness); */ return prob (B1, B2, N, nr, S, smoothness + log(mpz_get_d (cof))); mpz_clear (cof); } return prob (B1, B2, N, nr, S, smoothness); } /* Compute probability for primes p == r (mod m) */ double pm1prob_rm (double B1, double B2, double N, double nr, int S, unsigned long r, unsigned long m) { unsigned long cof; double smoothness = 1.2269688; unsigned long p; cof = m; for (p = 2UL; p < 100UL; p++) if (cof % p == 0UL) /* For each prime in m */ { unsigned long cof_r, k, i; /* Divisibility by i is determined by r and m. We need to adjust the smoothness parameter. In P-1, we had estimated the expected value for the exponent of p as p/(p-1)^2. Undo that. */ smoothness -= (double)p / ((p-1)*(p-1)) * log ((double) p); /* The expected value for the exponent of this prime is k s.t. p^k || r, plus 1/(p-1) if p^k || m as well */ cof_r = gcd (r - 1UL, m); for (k = 0UL; cof_r % p == 0UL; k++) cof_r /= p; smoothness += k * log ((double) p); cof_r = m; for (i = 0UL; cof_r % p == 0UL; i++) cof_r /= p; if (i == k) smoothness += (1./(p - 1.) 
* log ((double) p)); while (cof % p == 0UL) cof /= p; printf ("pm1prob_rm: p = %lu, k = %lu, i = %lu, new smoothness = %f\n", p, i, k, smoothness); } return prob (B1, B2, N, nr, S, smoothness); } /* The \Phi(x,y) function gives the number of natural numbers <= x that have no prime factor <= y, see Tenenbaum, "Introduction the analytical and probabilistic number theory", III.6. This function estimates the \Phi(x,y) function via eq. (48) of the 1st edition resp. equation (6.49) of the 3rd edition of Tenenbaum's book. */ #if defined(TESTDRIVE) && defined(HAVE_LIBGSL) static double integrand1 (double x, double *y) { return pow (*y, x) / x * log(x-1.); } static double integrand2 (double v, double *y) { return Buchstab_omega (v) * pow (*y, v); } /* Return approximate number of integers n with x1 < n <= x2 that have no prime factor <= y */ double no_small_prime (double x1, double x2, double y) { double u1, u2; ASSERT (x1 >= 2.); ASSERT (x2 >= x1); ASSERT (y >= 2.); if (x1 == x2 || x2 <= y) return 0.; if (x1 < y) x1 = y; u1 = log(x1)/log(y); u2 = log(x2)/log(y); /* If no prime factors <= sqrt(x2), numbers must be a primes > y */ if (x2 <= y*y) return (Li(x2) - Li(x1)); if (u2 <= 3) { double r, abserr; size_t neval; gsl_function f; f.function = (double (*) (double, void *)) &integrand1; f.params = &y; /* intnum(v=1,u,buchstab(v)*y^v) */ /* First part: intnum(v=u1, u, y^v/v*log(v-1.)) */ gsl_integration_qng (&f, MAX(u1, 2.) 
, u2, 0., 0.001, &r, &abserr, &neval); /* Second part: intnum(v=u1, u2, y^v/v) = Li(x2) - Li(x1) */ r += Li (x2) - Li (x1); return r; } { double r, abserr; size_t neval; gsl_function f; f.function = (double (*) (double, void *)) &integrand2; f.params = &y; gsl_integration_qng (&f, u1, u2, 0., 0.001, &r, &abserr, &neval); return r; } } static double integrand3 (double p, double *param) { const double x1 = param[0]; const double x2 = param[1]; const double y = param[2]; return no_small_prime (x1 / p, x2 / p, y) / log(p); } double no_small_prime_factor (const double x1, const double x2, const double y, const double z1, const double z2) { double r, abserr, param[3]; size_t neval; gsl_function f; param[0] = x1; param[1] = x2; param[2] = y; f.function = (double (*) (double, void *)) &integrand3; f.params = ¶m; gsl_integration_qng (&f, z1, z2, 0., 0.01, &r, &abserr, &neval); return r; } #endif #ifdef TESTDRIVE int main (int argc, char **argv) { double B1, B2, N, nr, r, m; int S; unsigned long p, i, pi; primegen pg[1]; primegen_init (pg); i = pi = 0; for (p = primegen_next (pg); p <= PRIME_PI_MAX; p = primegen_next (pg)) { for ( ; i < p; i++) prime_pi[PRIME_PI_MAP(i)] = pi; pi++; } for ( ; i < p; i++) prime_pi[PRIME_PI_MAP(i)] = pi; if (argc < 2) { printf ("Usage: rho [ ]\n"); return 1; } if (strcmp (argv[1], "-Buchstab_Phi") == 0) { unsigned long x, y, r; if (argc < 4) { printf ("-Buchstab_Phi needs x and y paramters\n"); exit (EXIT_FAILURE); } x = strtoul (argv[2], NULL, 10); y = strtoul (argv[3], NULL, 10); r = Buchstab_Phi (x, y); printf ("Buchstab_Phi (%lu, %lu) = %lu\n", x, y, r); exit (EXIT_SUCCESS); } else if (strcmp (argv[1], "-Buchstab_Psi") == 0) { unsigned long x, y, r; if (argc < 4) { printf ("-Buchstab_Psi needs x and y paramters\n"); exit (EXIT_FAILURE); } x = strtoul (argv[2], NULL, 10); y = strtoul (argv[3], NULL, 10); r = Buchstab_Psi (x, y); printf ("Buchstab_Psi (%lu, %lu) = %lu\n", x, y, r); exit (EXIT_SUCCESS); } else if (strcmp (argv[1], "-nsp") == 
0) { double x1, x2, y, r; if (argc < 5) { printf ("-nsp needs x1, x2, and y paramters\n"); exit (EXIT_FAILURE); } x1 = atof (argv[2]); x2 = atof (argv[3]); y = atof (argv[4]); r = no_small_prime (x1, x2, y); printf ("no_small_prime(%f, %f, %f) = %f\n", x1, x2, y, r); exit (EXIT_SUCCESS); } else if (strcmp (argv[1], "-nspf") == 0) { double x1, x2, y, z1, z2, r; if (argc < 7) { printf ("-nspf needs x1, x2, y, z1, and z2 paramters\n"); exit (EXIT_FAILURE); } x1 = atof (argv[2]); x2 = atof (argv[3]); y = atof (argv[4]); z1 = atof (argv[5]); z2 = atof (argv[6]); r = no_small_prime_factor (x1, x2, y, z1, z2); printf ("no_small_prime(%f, %f, %f, %f, %f) = %f\n", x1, x2, y, z1, z2, r); exit (EXIT_SUCCESS); } if (argc < 6) { printf ("Need 5 or 7 arguments: B1 B2 N nr S [r m]\n"); exit (EXIT_FAILURE); } B1 = atof (argv[1]); B2 = atof (argv[2]); N = atof (argv[3]); nr = atof (argv[4]); S = atoi (argv[5]); r = 0; m = 1; if (argc > 7) { r = atoi (argv[6]); m = atoi (argv[7]); } rhoinit (256, 10); if (N < 50.) { double sum; sum = ecmprob(B1, B2, exp2 (N), nr, S); sum += 4. * ecmprob(B1, B2, 3./2. * exp2 (N), nr, S); sum += ecmprob(B1, B2, 2. * exp2 (N), nr, S); sum *= 1./6.; printf ("ECM: %.16f\n", sum); sum = pm1prob_rm (B1, B2, exp2 (N), nr, S, r, m); sum += 4. * pm1prob_rm (B1, B2, 3./2. * exp2 (N), nr, S, r, m); sum += pm1prob_rm (B1, B2, 2. 
* exp2 (N), nr, S, r, m); sum *= 1./6.; printf ("P-1: %.16f\n", sum); } else { printf ("ECM: %.16f\n", ecmprob(B1, B2, N, nr, S)); printf ("P-1: %.16f\n", pm1prob_rm (B1, B2, N, nr, S, r, m)); } rhoinit (0, 0); return 0; } #endif ecm-6.4.4/ecm-params.h.default0000644023561000001540000000071212106741273013037 00000000000000#define MPZMOD_THRESHOLD 170 #define REDC_THRESHOLD 294 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 1, 1, 8, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 16, 1, 1, 16, 16, 1, 1, 16, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 #define MUL_NTT_THRESHOLD 1024 #define PREREVERTDIVISION_NTT_THRESHOLD 64 #define POLYINVERT_NTT_THRESHOLD 512 #define POLYEVALT_NTT_THRESHOLD 512 #define MPZSPV_NORMALISE_STRIDE 512 ecm-6.4.4/mul_fft.c0000644023561000001540000022621712113414351011020 00000000000000/* An implementation in GMP of Scho"nhage's fast multiplication algorithm modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998. Revised July 2002 and January 2003, Paul Zimmermann. Further revised by Pierrick Gaudry, Paul Zimmermann, and Torbjorn Granlund, March/April and November/December 2006, and also by Alexander Kruppa in December 2006. Revised December 2007 for inclusion into GMP-ECM. THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* References: Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker Strassen, Computing 7, p. 281-292, 1971. Asymptotically fast algorithms for the numerical multiplication and division of polynomials with complex coefficients, by Arnold Scho"nhage, Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. Tapes versus Pointers, a study in implementing fast algorithms, by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986. See also http://www.loria.fr/~zimmerma/bignum Future: It might be possible to avoid a small number of MPN_COPYs by using a rotating temporary or two. Multiplications of unequal sized operands can be done with this code, but it needs a tighter test for identifying squaring (same sizes as well as same pointers). */ /* Throughout this file, Mp is chosen so that ord_{2^Nprime + 1}(sqrt(2)^Mp) == 2^k */ #include "config.h" #include #include /* for abort() */ #include /* for LONG_MAX */ #include #ifdef HAVE_ALLOCA_H #include #endif #ifdef HAVE_MALLOC_H #include #endif #include "gmp.h" #include "mul_fft-params.h" /* All functions that are not declared static are renamed to avoid conflicts with GMP's functions. Should we include ecm-impl.h instead? 
*/ #define mpn_mul_fft __ecm_mpn_mul_fft #define mpn_mul_fft_full __ecm_mpn_mul_fft_full #define mpn_fft_best_k __ecm_mpn_fft_best_k #define mpn_fft_next_size __ecm_mpn_fft_next_size #ifndef MUL_FFT_TABLE2 #define MUL_FFT_TABLE2 {{1, 4}, {897, 5}, {2305, 6}, {4865, 7}, {11777, 8}, {31745, 9}, {98305, 10}, {1040385, 11}, {LONG_MAX, 0}} #endif #ifndef MUL_FFTM_TABLE2 #define MUL_FFTM_TABLE2 {{1, 4}, {833, 5}, {2049, 6}, {4609, 7}, {9217, 8}, {23553, 9}, {63489, 10}, {196609, 11}, {778241, 12}, {1032193, 13}, {LONG_MAX, 0}} #endif #ifndef SQR_FFT_TABLE2 #define SQR_FFT_TABLE2 MUL_FFT_TABLE2 #endif #ifndef SQR_FFTM_TABLE2 #define SQR_FFTM_TABLE2 MUL_FFTM_TABLE2 #endif #ifndef MUL_FFT_MODF_THRESHOLD #define MUL_FFT_MODF_THRESHOLD 300 #endif #ifndef SQR_FFT_MODF_THRESHOLD #define SQR_FFT_MODF_THRESHOLD 568 #endif #ifndef ASSERT #ifdef WANT_ASSERT #define ASSERT(x) assert(x) #else #define ASSERT(x) #endif #endif #ifndef ASSERT_ALWAYS #define ASSERT_ALWAYS(x) assert(x) #endif #ifndef LIKELY #if defined(__GNUC__) #define LIKELY(x) __builtin_expect ((x) != 0, 1) #else #define LIKELY(x) x #endif #endif /* _PROTO macro is copied from longlong.h of GMP */ #ifndef _PROTO #if (__STDC__-0) || defined (__cplusplus) || defined( _MSC_VER ) #define _PROTO(x) x #else #define _PROTO(x) () #endif #endif #ifndef MP_LIMB_T_MAX #define MP_LIMB_T_MAX (~(mp_limb_t)0) #endif #ifndef GMP_LIMB_HIGHBIT #define GMP_LIMB_HIGHBIT (MP_LIMB_T_MAX ^ (MP_LIMB_T_MAX >> 1)) #endif #ifndef TMP_DECL #define TMP_DECL #endif #ifndef TMP_MARK #define TMP_MARK #endif #ifndef TMP_FREE #define TMP_FREE #endif #ifndef TMP_ALLOC_LIMBS #define TMP_ALLOC_LIMBS(n) alloca((n) * sizeof(mp_limb_t)) #endif #ifndef TMP_ALLOC_MP_PTRS #define TMP_ALLOC_MP_PTRS(n) alloca((n) * sizeof(mp_ptr)) #endif #ifndef TMP_ALLOC_TYPE #define TMP_ALLOC_TYPE(n,t) alloca((n) * sizeof(t)) #endif #ifndef __GMP_ALLOCATE_FUNC_LIMBS #define __GMP_ALLOCATE_FUNC_LIMBS(n) malloc((n) * sizeof(mp_limb_t)) #endif #ifndef __GMP_FREE_FUNC_LIMBS #define 
__GMP_FREE_FUNC_LIMBS(a,n) free(a) #endif #if !defined(__GNUC__) #define __builtin_constant_p(x) 0 #endif #ifndef MPN_ZERO /* from gmp-impl.h */ #define MPN_ZERO(dst, n) \ do { \ ASSERT ((n) >= 0); \ if ((n) != 0) \ { \ mp_ptr __dst = (dst); \ mp_size_t __n = (n); \ do \ *__dst++ = 0; \ while (--__n); \ } \ } while (0) #endif #ifndef MPN_DECR_U /* copied from gmp-4.2.1/gmp-impl.h */ #define MPN_DECR_U(p,size,incr) \ do { \ mp_limb_t __x; \ mp_ptr __p = (p); \ if (__builtin_constant_p (incr) && (incr) == 1) \ { \ while ((*(__p++))-- == 0) \ ; \ } \ else \ { \ __x = *__p; \ *__p = __x - (incr); \ if (__x < (incr)) \ while ((*(++__p))-- == 0) \ ; \ } \ } while (0) #endif #ifndef mpn_incr_u /* copied from gmp-4.2.1/gmp-impl.h */ #define mpn_incr_u(p,incr) \ do { \ mp_limb_t __x; \ mp_ptr __p = (p); \ if (__builtin_constant_p (incr) && (incr) == 1) \ { \ while (++(*(__p++)) == 0) \ ; \ } \ else \ { \ __x = *__p + (incr); \ *__p = __x; \ if (__x < (incr)) \ while (++(*(++__p)) == 0) \ ; \ } \ } while (0) #endif #ifndef MPN_INCR_U /* copied from gmp-4.2.1/gmp-impl.h */ #define MPN_INCR_U(ptr, size, n) mpn_incr_u (ptr, n) #endif #ifndef MPN_COPY /* copied from gmp-4.2.1/gmp-impl.h */ #define MPN_COPY(dst, src, n) \ do { \ if ((n) != 0) \ { \ mp_size_t __n = (n) - 1; \ mp_ptr __dst = (dst); \ mp_srcptr __src = (src); \ mp_limb_t __x; \ ASSERT ((n) > 0); \ __x = *__src++; \ if (__n != 0) \ { \ do \ { \ *__dst++ = __x; \ __x = *__src++; \ } \ while (--__n); \ } \ *__dst++ = __x; \ } \ } while (0) #endif #ifndef mpn_com_n /* copied from gmp-4.2.1/gmp-impl.h */ #define mpn_com_n(d,s,n) \ do { \ mp_ptr __d = (d); \ mp_srcptr __s = (s); \ mp_size_t __n = (n); \ ASSERT (__n >= 1); \ do \ *__d++ = (~ *__s++) & GMP_NUMB_MASK; \ while (--__n); \ } while (0) #endif #ifndef mpn_sqr_n #define mpn_sqr_n(a,b,n) mpn_mul_n(a,b,b,n) #endif /* Uncomment this define to disable to use of sqrt(2) as a root of unity for the transform/weight signal. 
The function mpn_fft_mul_sqrt2exp_modF() will still get called, but parameters for the transform will be chosen so that it will always be called with an even exponent, thus the multiplication will be by a power of 2. */ /* #define NO_SQRT_2 */ /* Change this to "#define TRACE(x) x" for some traces. */ #define TRACE(x) /* #define COUNT_ZEROCOPY */ /* This define enables interleaved decomposition/forward transform in Bailey's algorithm for better data locality */ #define MERGED_BAILEY_DECOMPOSE /* The MPN_ZERO and MPN_COPY macros are pretty slow in GMP 4.2 (and presumably previous versions) so we try to define quicker functions here. For now we simply use the string store/copy instruction which is ok, although not optimal (MMX or XMM would probably do better). */ #define OWN_MPN_FFT_ZERO /* REP MOVSL/MOVSQ seems to be no faster or slower than MPN_COPY() */ /* #define OWN_MPN_FFT_COPY */ #if defined(__x86_64__) && defined(__GNUC__) && defined(OWN_MPN_FFT_ZERO) static inline void MPN_FFT_ZERO (mp_ptr dst, mp_size_t n) { __asm__ __volatile__ ("rep stosq": "+c" (n), "+D" (dst): "a" (0L) : "memory"); /* Put n in %rcx, which will also be written (decreased to 0) by the instruction and put dst in %rdi which will also be written (increased by 8*n). Put 0 in %rax. 
*/ } #elif defined(__i386__) && defined(__GNUC__) && defined(OWN_MPN_FFT_ZERO) static inline void MPN_FFT_ZERO (mp_ptr dst, mp_size_t n) { __asm__ __volatile__ ("rep stosl" : "+c" (n), "+D" (dst) : "a" (0) : "memory"); } #elif defined(_MSC_VER) && !defined(_WIN64) static inline void MPN_FFT_ZERO (mp_ptr dst, mp_size_t n) { ASSERT(n >= 0); __asm { push edi mov edi,dst xor eax,eax mov ecx,n rep stosd pop edi } } #else /* Fall back to GMP's MPN_ZERO() macro */ #define MPN_FFT_ZERO(dst, n) MPN_ZERO(dst,n) #endif #if defined(__x86_64__) && defined(__GNUC__) && defined(OWN_MPN_FFT_ZERO) static inline void MPN_FFT_STORE (mp_ptr dst, mp_size_t n, mp_limb_t d) { __asm__ __volatile__ ("rep stosq": "+c" (n), "+D" (dst): "a" (d) : "memory"); /* Put n in %rcx, which will also be written (decreased to 0) by the instruction and put dst in %rdi which will also be written (increased by 8*n). Put 0 in %rax. */ } #elif defined(__i386__) && defined(__GNUC__) && defined(OWN_MPN_FFT_ZERO) static inline void MPN_FFT_STORE (mp_ptr dst, mp_size_t n, mp_limb_t d) { __asm__ __volatile__ ("rep stosl" : "+c" (n), "+D" (dst) : "a" (d) : "memory"); } #elif defined(_MSC_VER) && !defined(_WIN64) static inline void MPN_FFT_STORE (mp_ptr dst, mp_size_t n, mp_limb_t d) { ASSERT(n >= 0); __asm { push edi mov edi,dst mov eax,d mov ecx,n rep stosd pop edi } } #else static inline void MPN_FFT_STORE (mp_ptr dst, mp_size_t n, mp_limb_t d) { ASSERT(n >= 0); for (; n > 0; n--) *dst++ = d; } #endif #if defined(__x86_64__) && defined(__GNUC__) && defined(OWN_MPN_FFT_COPY) static inline void MPN_FFT_COPY (mp_ptr dst, const mp_srcptr src, mp_size_t n) { __asm__ __volatile__ ("rep movsq": "+c" (n), "+S" (src), "+D" (dst) : "memory"); /* Put n in %rcx, which will also be written (decreased to 0) by the instruction, put src in %rsi and put dst in %rdi which will both also be written (each increased by 8*n). FIXME: should "memory" go in the clobbered list? 
*/ } #elif defined(__i386__) && defined(__GNUC__) && defined(OWN_MPN_FFT_COPY) static inline void MPN_FFT_COPY (mp_ptr dst, const mp_srcptr src, mp_size_t n) { __asm__ __volatile__ ("rep movsl" : "+c" (n), "+S" (src), "+D" (dst) : "memory"); } #elif defined(_MSC_VER) && !defined(_WIN64) static inline void MPN_FFT_COPY (mp_ptr dst, const mp_srcptr src, mp_size_t n) { __asm { push esi push edi mov edi,dst mov esi,src mov ecx,n rep movsd pop edi pop esi } } #else /* Fall back to GMP's MPN_COPY() macro */ #define MPN_FFT_COPY(dst, src, n) MPN_COPY(dst,src,n) #endif /* If LOG2_GMP_NUMB_BITS is defined, GMP_NUMB_BITS=2^LOG2_GMP_NUMB_BITS; this enables to speed up multiplication or division by GMP_NUMB_BITS. */ #if (GMP_NUMB_BITS == 32) #define LOG2_GMP_NUMB_BITS 5 #elif (GMP_NUMB_BITS == 64) #define LOG2_GMP_NUMB_BITS 6 #endif static inline unsigned int mpn_mul_fft_lcm (unsigned int, unsigned int); /* quotient, remainder, product by GMP_NUMB_BITS */ #ifdef LOG2_GMP_NUMB_BITS #define MOD_GMP_NUMB_BITS(x) ((x) & ((1 << LOG2_GMP_NUMB_BITS) - 1)) #define DIV_GMP_NUMB_BITS(x) ((x) >> LOG2_GMP_NUMB_BITS) /* x <- y / (2 * GMP_NUMB_BITS), y <- y % (2 * GMP_NUMB_BITS) */ #define DIVMOD_2GMP_NUMB_BITS(x,y) \ x = (y) >> (LOG2_GMP_NUMB_BITS + 1); \ y = (y) & ((1 << (LOG2_GMP_NUMB_BITS + 1)) - 1) #define MUL_GMP_NUMB_BITS(x) ((x) << LOG2_GMP_NUMB_BITS) #define MUL_2GMP_NUMB_BITS(x) ((x) << (LOG2_GMP_NUMB_BITS + 1)) #define MUL_4GMP_NUMB_BITS(x) ((x) << (LOG2_GMP_NUMB_BITS + 2)) #define LCM_GMP_NUMB_BITS(x) (((x) > LOG2_GMP_NUMB_BITS) ? 
(1<<(x)) : GMP_NUMB_BITS) #else #define MOD_GMP_NUMB_BITS(x) ((x) % GMP_NUMB_BITS) #define DIV_GMP_NUMB_BITS(x) ((x) / GMP_NUMB_BITS) #define DIVMOD_2GMP_NUMB_BITS(x,y) \ x = (y) / (2 * GMP_NUMB_BITS); \ y = (y) - (x) * (2 * GMP_NUMB_BITS) #define MUL_GMP_NUMB_BITS(x) ((x) * GMP_NUMB_BITS) #define MUL_2GMP_NUMB_BITS(x) ((x) * (2 * GMP_NUMB_BITS)) #define MUL_4GMP_NUMB_BITS(x) ((x) * (4 * GMP_NUMB_BITS)) /* lcm(GMP_NUMB_BITS, 2^x) */ #define LCM_GMP_NUMB_BITS(x) mpn_mul_fft_lcm (GMP_NUMB_BITS, x) #endif #define ONE ((mp_limb_t) 1) static int mpn_mul_fft_internal _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, int, mp_ptr *, mp_ptr *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, int **, mp_ptr, int, int)); /* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n. sqr==0 if for a multiply, sqr==1 for a square. Don't declare it static since it is needed by tuneup. */ #define MPN_FFT_TABLE2_SIZE 256 struct nk { mp_size_t n; unsigned char k; }; static struct nk mpn_fft_table2[4][MPN_FFT_TABLE2_SIZE] = { MUL_FFT_TABLE2, SQR_FFT_TABLE2, MUL_FFTM_TABLE2, SQR_FFTM_TABLE2 }; /* sqr_b = 0: plain multiplication mod 2^N+1 sqr_b = 1: square mod 2^N+1 sqr_b = 2: plain multiplication mod 2^N-1 sqr_b = 3: square mod 2^N-1 */ int mpn_fft_best_k (mp_size_t n, int sqr_b) { struct nk *tab; int last_k; last_k = 4; for (tab = mpn_fft_table2[sqr_b] + 1; ; tab++) { if (n < tab->n) break; last_k = tab->k; } return last_k; } #ifdef MUL_FFT_FULL_TABLE2 #define MPN_FFT_FULL_TABLE2_SIZE 256 #ifndef SQR_FFT_FULL_TABLE2 #define SQR_FFT_FULL_TABLE2 MUL_FFT_FULL_TABLE2 #endif static struct nk mpn_fft_full_table2[4][MPN_FFT_FULL_TABLE2_SIZE] = { MUL_FFT_FULL_TABLE2, SQR_FFT_FULL_TABLE2 }; static int mpn_fft_best_a (mp_size_t pl, int sqr) { struct nk *tab; int last_a; last_a = 1; for (tab = mpn_fft_full_table2[sqr] + 1; ; tab++) { if (pl < tab->n) break; last_a = tab->k; } return last_a; } #endif /* MUL_FFT_FULL_TABLE2 */ /* Returns smallest possible number of limbs >= 
pl for a fft of size 2^k, i.e. smallest multiple of 2^k >= pl. Don't declare static: needed by tuneup. */ mp_size_t mpn_fft_next_size (mp_size_t pl, int k) { pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */ return pl << k; } /* Initialize l[i][j] with bitrev(j) */ static void mpn_fft_initl (int **l, int k) { int i, j, K; int *li; l[0][0] = 0; for (i = 1, K = 1; i <= k; i++, K *= 2) { li = l[i]; for (j = 0; j < K; j++) { li[j] = 2 * l[i - 1][j]; li[K + j] = 1 + li[j]; } } } #ifndef HAVE_NATIVE_mpn_lshiftc /* Shift {up, n} cnt bits to the left, store the complemented result in {rp, n}, and output the shifted bits (not complemented). Same as: cc = mpn_lshift (rp, up, n, cnt); mpn_com_n (rp, rp, n); return cc; Assumes n >= 1 and 1 <= cnt < GMP_NUMB_BITS. {rp, n} and {up, n} may overlap, provided rp >= up (like mpn_lshift). */ static mp_limb_t mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) { mp_limb_t high_limb, low_limb; unsigned int tnc; mp_size_t i; mp_limb_t retval; ASSERT(n >= 1); ASSERT(1 <= cnt && cnt < GMP_NUMB_BITS); up += n; rp += n; tnc = GMP_NUMB_BITS - cnt; low_limb = *--up; retval = low_limb >> tnc; high_limb = (low_limb << cnt); for (i = n - 1; i != 0; i--) { low_limb = *--up; *--rp = (~(high_limb | (low_limb >> tnc))) & GMP_NUMB_MASK; high_limb = low_limb << cnt; } *--rp = (~high_limb) & GMP_NUMB_MASK; return retval; } #endif /* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1, by subtracting that modulus if necessary. If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then mpn_sub_1 produces a borrow and the limbs must be zeroed out again. This will occur very infrequently. */ static inline void mpn_fft_normalize (mp_ptr ap, mp_size_t n) { if (ap[n] != 0) { MPN_DECR_U (ap, n + 1, ONE); if (ap[n] == 0) { /* This happens with very low probability; we have yet to trigger it, and thereby make sure this code is correct. 
*/ MPN_FFT_ZERO (ap, n); ap[n] = 1; } else ap[n] = 0; } } /* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1} Assumes a is semi-normalized, i.e. a[n] <= 1. r and a must have n+1 limbs, and not overlap. */ static void mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, unsigned int d, mp_size_t n) { int sh, negate; mp_limb_t cc, rd; ASSERT(d < 2 * (unsigned int) n * GMP_NUMB_BITS); sh = MOD_GMP_NUMB_BITS(d); d = DIV_GMP_NUMB_BITS(d); negate = d >= (unsigned int) n; if (negate) { d -= n; /* r[0..d-1] <-- lshift(a[n-d]..a[n-1], sh) r[d..n-1] <-- -lshift(a[0]..a[n-d-1], sh) */ if (sh != 0) { /* no out shift below since a[n] <= 1 */ mpn_lshift (r, a + n - d, d + 1, sh); rd = r[d]; cc = mpn_lshiftc (r + d, a, n - d, sh); } else { #ifdef COUNT_ZEROCOPY printf ("mpn_fft_mul_2exp_modF: MPN_FFT_COPY 1 with %d limbs\n", d); #endif /* Executed 256 times for 1000000 limb mpn_mul_n, each d in [0, 255] appears exactly once */ MPN_COPY (r, a + n - d, d); rd = a[n]; mpn_com_n (r + d, a, n - d); cc = 0; } /* add cc to r[0], and add rd to r[d] */ /* now add 1 in r[d], subtract 1 in r[n], i.e. add 1 in r[0] */ r[n] = 0; /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */ mpn_incr_u (r, cc + 1); rd ++; /* rd might overflow when sh=GMP_NUMB_BITS-1 */ cc = (rd == 0) ? ONE : rd; r = r + d + (rd == 0); mpn_incr_u (r, cc); return; } /* if negate=0, r[0..d-1] <-- -lshift(a[n-d]..a[n-1], sh) r[d..n-1] <-- lshift(a[0]..a[n-d-1], sh) */ if (sh != 0) { /* no out bits below since a[n] <= 1 */ mpn_lshiftc (r, a + n - d, d + 1, sh); rd = ~r[d]; /* {r, d+1} = {a+n-d, d+1} << sh */ cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<= (unsigned int) n) /* (a-b)*B^d = (b-a)*B^(d-n) */ { mp_srcptr t; t = a; a = b; b = t; d -= n; } if (d == 0) mpn_fft_sub_modF (r, a, b, n); else { mp_limb_t cc; /* let a = ah * B^(n-d) + al and b = bh * B^(n-d) + bl, where ah, bh have d limbs + 1 bit, and al, bl have n-d limbs. Then (a-b)*B^d = (al-bl) * B^d + (bh-ah). 
*/ ASSERT (0 < d && d < (unsigned int) n); cc = mpn_sub_n (r, b + n - d, a + n - d, d); /* bh-ah */ #ifdef HAVE_NATIVE_mpn_sub_nc cc = mpn_sub_nc (r + d, a, b, n - d, cc); /* al-bl */ #else cc = mpn_sub_n (r + d, a, b, n - d) + mpn_sub_1 (r + d, r + d, n - d, cc); #endif /* 0 <= cc <= 1 */ if (a[n] > b[n]) cc += mpn_sub_1 (r + d, r + d, n - d, a[n] - b[n]); /* 0 <= cc <= 2 */ else cc -= mpn_add_1 (r + d, r + d, n - d, b[n] - a[n]); /* -1 <= cc <= 1 */ /* -1 <= cc <= 2 */ /* cc is the borrow at r[n], which must be added at r[0] */ r[n] = cc >> (GMP_NUMB_BITS - 1); MPN_INCR_U (r, n + 1, cc + r[n]); } } #ifdef _MSC_VER /* optimisation bug on VC++ v9 */ # pragma optimize( "", on ) #endif /* r <- a*sqrt(2)^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1} Assumes a is semi-normalized, i.e. a[n] <= 1. Assumes 0 < d < 4*n*GMP_NUMB_BITS. r and a must have n+1 limbs, and not overlap. Calls mpn_fft_mul_2exp_modF() and mpn_fft_sub_modF(). */ static void mpn_fft_mul_sqrt2exp_modF (mp_ptr r, mp_srcptr a, unsigned int d, mp_size_t n) { unsigned int e = d >> 1; unsigned int N = MUL_GMP_NUMB_BITS(n); /* n * GMP_NUMB_BITS */ mp_ptr tmp; mp_srcptr b; mp_limb_t ca, cc; mp_size_t l; TMP_DECL; ASSERT(0 < d && d < 4 * N); ASSERT(a != r); #ifdef NO_SQRT_2 ASSERT_ALWAYS(d % 2 == 0); #endif /* If d is even, we have a regular multiplication by a power of 2 */ if ((d & 1) == 0) { /* since d cannot be zero, e cannot be zero too */ if (e < GMP_NUMB_BITS) mpn_fft_mul_2exp_modFa (r, a, e, n); else mpn_fft_mul_2exp_modF (r, a, e, n); return; } ASSERT(N % 4 == 0); /* Multiply by sqrt(2) * 2^e = (2^(3N/4) - 2^(N/4)) * 2^e = 2^(3N/4 + e) - 2^(N/4 + e) */ e += 3 * (N >> 2); /* 3N/4 <= e < 11N/4 */ if (e >= 2 * N) e -= 2 * N; /* 0 <= e < 2N */ TMP_MARK; tmp = TMP_ALLOC_LIMBS(n + 1); ASSERT(tmp != NULL); /* the following variant avoids the -H-L computation, which requires a call to mpn_com_n(). 
*/ if (e != 0) { mpn_fft_mul_2exp_modF (r, a, e, n); /* a*2^(e+N/2) */ b = r; } else b = a; l = n >> 1; if ((n & 1) != 0) { mpn_lshift (tmp, b, n + 1, GMP_NUMB_BITS >> 1); } else MPN_COPY (tmp + n - l, b + n - l, l + 1); /* we still have to shift {tmp, n+1} by l limbs to the left: let tl = {tmp, n-l} and th = {tmp+n-l,l+1} */ /* rh <- bh + tl, rl <- bl - th */ ca = b[n] + mpn_add_n (r + l, b + l, (n & 1) ? tmp : b, n - l); cc = tmp[n] + mpn_sub_n (r, b, tmp + n - l, l); cc = mpn_sub_1 (r + l, r + l, n - l, cc); /* We must subtract 0 <= ca <= 2 and add 0 <= cc <= 1 at r[0]. If cc >= ca: r[n]=0 and add cc - ca. If cc < ca: r[n]=1 and subtract ca-cc-1. */ r[n] = cc < ca; if (cc >= ca) MPN_INCR_U (r, n + 1, cc - ca); else /* cc < ca */ MPN_DECR_U (r, n + 1, ca - ONE - cc); TMP_FREE; } /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+b and store in tmp. tmp must have space for Kl + 1 limbs */ static void mpn_mul_fft_reduce (mp_ptr tmp, /* mp_srcptr A, */ mp_srcptr n, mp_size_t nl, mp_size_t Kl, /* int l, */ int b) { mp_size_t dif = nl - Kl; mp_limb_signed_t cy; if (dif > Kl) { int subp = 0; cy = ((b == 1) ? mpn_sub_n : mpn_add_n) (tmp, n, n + Kl, Kl); n += 2 * Kl; dif -= Kl; /* now dif > 0 */ while (dif > Kl) { if (b == -1) cy += mpn_add_n (tmp, tmp, n, Kl); else if (subp) cy += mpn_sub_n (tmp, tmp, n, Kl); else cy -= mpn_add_n (tmp, tmp, n, Kl); subp ^= 1; n += Kl; dif -= Kl; } /* now dif <= Kl */ if (b == -1) cy += mpn_add (tmp, tmp, Kl, n, dif); else if (subp) cy += mpn_sub (tmp, tmp, Kl, n, dif); else cy -= mpn_add (tmp, tmp, Kl, n, dif); if (cy >= 0) cy = mpn_add_1 (tmp, tmp, Kl, cy); else cy = mpn_sub_1 (tmp, tmp, Kl, -cy); } else /* dif <= Kl, i.e. nl <= 2 * Kl */ { cy = ((b == 1) ? 
mpn_sub : mpn_add) (tmp, n, Kl, n + Kl, dif); cy = mpn_add_1 (tmp, tmp, Kl, cy); } tmp[Kl] = cy; } /* Store in {A+(nprime + 1) * offset, nprime+1} the first l limbs (with zero padding) from {n + l*offset, ...}, and in {A + (nprime+1)*(offset + 1< Kl + 1) { /* FIXME: We really don't want to do this multiple times if stride > 0 ! */ TRACE(printf ("mpn_mul_fft_decompose: This takes too long!\n");) tmp = TMP_ALLOC_LIMBS(Kl + 1); ASSERT(tmp != NULL); mpn_mul_fft_reduce (tmp, /* A, */ n, nl, Kl, /* l, */ b); n = tmp; nl = Kl + 1; } A += (nprime + 1) * offset; n += l * offset; nl -= (l * offset < nl) ? l * offset : nl; /* for b=1, since we use {T, nprime+1} as temporary array below, and only the first l limbs may be non-zero, except for the last part, we can set {T+l, nprime+1-l} to zero now. */ MPN_FFT_ZERO (T + l, nprime + 1 - l); for (i = offset; i < K; i += 1 << stride) { Ap[i] = A; /* store the next l limbs of n into A[0..nprime] */ /* nl == 0 => j == 0, nl unchanged */ j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */ nl -= j; nl -= (nl > (l << stride) - l) ? (l << stride) - l : nl; if (b == 1 && i != 0 && j > 0) { /* add weight signal for negacyclic convolution. We need a root of unity here whose order is twice the transform length K. Since ord(sqrt(2)^Mp) = K, sqrt(2)^(Mp/2) will do, so long as Mp is even. */ #define FORCE_EXPENSIVE_DECOMPOSE 0 if ((FORCE_EXPENSIVE_DECOMPOSE) || (i & (Mp / 2) & 1)) { #ifdef COUNT_ZEROCOPY printf ("mpn_mul_fft_decompose: MPN_FFT_COPY 1 with %d limbs\n", j); #endif MPN_FFT_COPY (T, n, j); ASSERT_ALWAYS (j <= l + 1); if (j < l) MPN_FFT_ZERO (T + j, l - j); mpn_fft_mul_sqrt2exp_modF (A, T, i * (Mp / 2), nprime); } else { /* i * Mp / 2 is even, so weight signal is sqrt(2)^(i * Mp / 2) = 2^(i * Mp / 4). Shift directly into A. 
*/ const int c = (i * Mp) / 4; const int d = c % GMP_NUMB_BITS; const int e = c / GMP_NUMB_BITS; #undef DECOMPOSE_CAREFUL_CHECK #ifdef DECOMPOSE_CAREFUL_CHECK /* Do it the expensive way and store result in T for comparison */ MPN_FFT_COPY (T, n, j); ASSERT_ALWAYS (j <= l + 1); if (j < l) MPN_FFT_ZERO (T + j, l - j); mpn_fft_mul_2exp_modF (A, T, c, nprime); MPN_COPY (T, A, nprime + 1); #endif /* Copy data from n to A+e, shifted by d bits. */ if (e + j < nprime || (e + j == nprime && d <= 1)) { /* The shifted data fits without wrapping */ MPN_FFT_ZERO (A, e); if (d == 0) { MPN_COPY(A + e, n, j); MPN_FFT_ZERO (A + e + j, nprime + 1 - e - j); } else { A[e + j] = mpn_lshift (A + e, n, j, d); /* Now zero A[e + j + 1 ... nprime] */ MPN_FFT_ZERO (A + e + j + 1, nprime - e - j); } } else { const int of = j + e - nprime; if (d == 0) { /* Here, e + j > nprime, i.e. there is wrapping but d == 0, so no bit shifting */ mp_limb_t cc; ASSERT(e + j > nprime); /* Hence of > 0 */ /* Store ~(N_hi) to A[0 ... of[ */ mpn_com_n (A, n + nprime - e, of); cc = mpn_add_1 (A, A, of, ONE); MPN_FFT_STORE (A + of, nprime - j, cc - ONE); /* Store n_lo * w^e */ ASSERT(nprime - e > 0); cc = mpn_sub_1 (A + e, n, nprime - e, ONE - cc); A[nprime] = 0; MPN_INCR_U (A, nprime + 1, cc); } else { /* Here, e + j >= nprime and d != 0 */ mp_limb_t cc; /* We want n*2^i with i < nprime*w, i > (nprime-j)*w, Store nprime-e words, shifted left by d, at A+e. */ cc = mpn_lshift (A + e, n, nprime - e, d); A[nprime] = 0; if (of > 0) { /* Store a_hi to A[0 ... of] */ A[of] = mpn_lshift (A, n + nprime - e, of, d); A[0] |= cc; /* And do binary negate */ mpn_com_n (A, A, of + 1); cc = mpn_add_1 (A, A, of + 1, ONE); } else { A[0] = -cc; cc = (cc == 0); } /* Store cc-1 to A[of+1 ... 
e[ */ MPN_FFT_STORE (A + of + 1, nprime - j - 1, cc - ONE); cc = mpn_sub_1 (A + e, A + e, nprime - e, ONE - cc); MPN_INCR_U (A, nprime + 1, cc); } } #ifdef DECOMPOSE_CAREFUL_CHECK ASSERT(A[nprime] <= 1); if (A[nprime] == 1) { /* Fully normalize for the sake of the following comparison */ mp_limb_t cc; cc = mpn_sub_1 (A, A, nprime, 1); A[nprime] = 0; mpn_add_1 (A, A, nprime + 1, cc); } if (mpn_cmp (A, T, nprime + 1) != 0) { printf ("nprime = %d, i = %d, j = %d, d = %d, " "e = %d\n", nprime, i, j, d, e); for (i = 0; i < nprime + 1; i++) printf ("%d: %lx %lx %c\n", i, A[i], T[i], (A[i] != T[i]) ? '!' : ' '); abort (); } MPN_ZERO (T, nprime + 1); #endif } } else /* b = -1 or i == 0 or j == 0. No weight to be added here. */ { #ifdef COUNT_ZEROCOPY printf ("mpn_mul_fft_decompose: MPN_FFT_COPY 2 with %d limbs\n", j); #endif MPN_COPY (A, n, j); MPN_FFT_ZERO (A + j, nprime + 1 - j); } ASSERT(A[nprime] <= 1); n += l << stride; A += (nprime + 1) << stride; } ASSERT_ALWAYS (nl == 0 || (nl == 1 && stride > 0 && offset == 0)); TMP_FREE; } /* A0 <- A0+A1 A1 <- (A0-A1)*2^e0 Butterfly using a rotating buffer instead of temp space. The variable rotbuf is a size-1 array of coefficients; this might be exchanged with one of the coefficients of A. 
*/ static inline void mpn_fft_butterfly_rotbuf (mp_ptr *A, mp_size_t i0, mp_size_t i1, unsigned int e0, mp_ptr *rotbuf, mp_size_t n) { unsigned int d, e = e0; ASSERT(e0 != 0); DIVMOD_2GMP_NUMB_BITS(d, e); /* 0 <= d < 2*n, 0 <= e0 < 2*GMP_NUMB_BITS */ mpn_fft_lshsub_modF (rotbuf[0], A[i0], A[i1], d, n); mpn_fft_add_modF (A[i0], A[i0], A[i1], n); if (e != 0) mpn_fft_mul_sqrt2exp_modF (A[i1], rotbuf[0], e, n); else { mp_ptr tmp = rotbuf[0]; rotbuf[0] = A[i1]; A[i1] = tmp; } } static inline void mpn_fft_butterfly_rotbuf0 (mp_ptr *A, mp_size_t i0, mp_size_t i1, mp_ptr *rotbuf, mp_size_t n) { mp_ptr tmp; mpn_fft_sub_modF (rotbuf[0], A[i0], A[i1], n); mpn_fft_add_modF (A[i0], A[i0], A[i1], n); tmp = rotbuf[0]; rotbuf[0] = A[i1]; A[i1] = tmp; } /* In this version, the shift e0 is in [0..N], so we have to do one more test on e0. */ static inline void mpn_fft_butterfly_rotbufN (mp_ptr *A, mp_size_t i0, mp_size_t i1, unsigned int e0, mp_ptr *rotbuf, mp_size_t n) { mp_size_t N = MUL_4GMP_NUMB_BITS(n); /* 4 * n * GMP_NUMB_BITS */ unsigned int d; if (e0 >= (unsigned int) N) e0 -= N; DIVMOD_2GMP_NUMB_BITS (d,e0); /* 0 <= d < 2*n, 0 <= e0 < 2*GMP_NUMB_BITS */ mpn_fft_lshsub_modF (rotbuf[0], A[i0], A[i1], d, n); mpn_fft_add_modF (A[i0], A[i0], A[i1], n); if (e0 != 0) mpn_fft_mul_sqrt2exp_modF (A[i1], rotbuf[0], e0, n); else { mp_ptr tmp = rotbuf[0]; rotbuf[0] = A[i1]; A[i1] = tmp; } } /* Radix 4 transform. This uses a rotating buffer: the array Ap gets unsorted (but we usually don't care). 
*/ static void mpn_fft_fft_radix4Rec (mp_ptr *Ap, mp_size_t ind_start, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf) { mp_size_t i, stride, stride2; stride2 = 1<<(k-2); stride = 1<<(k-1); if (k == 1) { mpn_fft_butterfly_rotbuf0(Ap, ind_start, ind_start+1, rotbuf, n); return; } mpn_fft_butterfly_rotbuf0 (Ap, ind_start, ind_start+stride, rotbuf, n); mpn_fft_butterfly_rotbuf (Ap, ind_start+stride2, ind_start+stride+stride2, omega*stride2, rotbuf, n); mpn_fft_butterfly_rotbuf0 (Ap, ind_start+stride, ind_start+stride+stride2, rotbuf, n); mpn_fft_butterfly_rotbuf0 (Ap, ind_start, ind_start+stride2, rotbuf, n); for (i = 1; i < stride2; ++i) { mpn_fft_butterfly_rotbuf(Ap, ind_start+i, ind_start+i+stride, omega*i, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind_start+i+stride2, ind_start+i+stride+stride2, omega*(i+stride2), rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind_start+i+stride, ind_start+i+stride+stride2, omega*i*2, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind_start+i, ind_start+i+stride2, omega*i*2, rotbuf, n); } if (k == 3) { mpn_fft_butterfly_rotbuf0(Ap, ind_start+stride+stride2, ind_start+stride+stride2+1, rotbuf, n); mpn_fft_butterfly_rotbuf0(Ap, ind_start+stride, ind_start+stride+1, rotbuf, n); mpn_fft_butterfly_rotbuf0(Ap, ind_start, ind_start+1, rotbuf, n); mpn_fft_butterfly_rotbuf0(Ap, ind_start+stride2, ind_start+stride2+1, rotbuf, n); } if (k > 3) { mp_size_t omega4 = omega<<2; mpn_fft_fft_radix4Rec(Ap, ind_start, k-2, omega4, n, rotbuf); mpn_fft_fft_radix4Rec(Ap, ind_start+stride2, k-2, omega4, n, rotbuf); mpn_fft_fft_radix4Rec(Ap, ind_start+stride, k-2, omega4, n, rotbuf); mpn_fft_fft_radix4Rec(Ap, ind_start+stride+stride2, k-2, omega4, n, rotbuf); } } static void mpn_fft_fft_radix4 (mp_ptr *Ap, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf) { mpn_fft_fft_radix4Rec(Ap, 0, k, omega, n, rotbuf); } /* The "Neg" versions multiply by the *inverse* of the root. This is used for the backward transform. 
Propagating this bit of information saves the %N, since only at the end we do N-blah. FIXME: The Neg and non-Neg versions can probably be merged at almost no cost. */ static void mpn_fft_fft_radix4RecNeg (mp_ptr *Ap, mp_size_t ind_start, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf) { mp_size_t i, stride, stride2; mp_size_t N = MUL_4GMP_NUMB_BITS(n); /* 4 * n * GMP_NUMB_BITS */ stride2 = 1 << (k - 2); stride = 1 << (k - 1); if (k == 1) { mpn_fft_butterfly_rotbufN(Ap, ind_start, ind_start+1, 0, rotbuf, n); return; } for (i = 0; i < stride2; ++i) { mpn_fft_butterfly_rotbufN(Ap, ind_start+i, ind_start+i+stride, N-omega*i, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind_start+i+stride2, ind_start+i+stride+stride2, N-omega*(i+stride2), rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind_start+i+stride, ind_start+i+stride+stride2, N-omega*i*2, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind_start+i, ind_start+i+stride2, N-omega*i*2, rotbuf, n); } if (k == 3) { mpn_fft_butterfly_rotbufN(Ap, ind_start+stride+stride2, ind_start+stride+stride2+1, 0, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind_start+stride, ind_start+stride+1, 0, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind_start, ind_start+1, 0, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind_start+stride2, ind_start+stride2+1, 0, rotbuf, n); } if (k > 3) { mp_size_t omega4 = omega<<2; mpn_fft_fft_radix4RecNeg(Ap, ind_start, k-2, omega4, n, rotbuf); mpn_fft_fft_radix4RecNeg(Ap, ind_start+stride2, k-2, omega4, n, rotbuf); mpn_fft_fft_radix4RecNeg(Ap, ind_start+stride, k-2, omega4, n, rotbuf); mpn_fft_fft_radix4RecNeg(Ap, ind_start+stride+stride2, k-2, omega4, n, rotbuf); } } static void mpn_fft_fft_radix4Neg (mp_ptr *Ap, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf) { mpn_fft_fft_radix4RecNeg(Ap, 0, k, omega, n, rotbuf); } static void mpn_fft_fft_radix4Inv(mp_ptr *Ap, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf, int **ll) { int i; /* Bit-reverse table Ap. 
FIXME: these bit-rev copies might be avaoided. But do they really cost? */ for (i = 0; i < 1< 3) { mpn_fft_fftR4_twistedRec(Ap, ind, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRec(Ap, ind+stride2, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRec(Ap, ind+stride, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRec(Ap, ind+stride+stride2, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); } return; } if (k == 1) { mpn_fft_butterfly_rotbuf (Ap, ind, ind + 1, omega * om_mult, rotbuf, n); return; } for (i = 0; i < stride2; ++i) { mp_size_t root = omega*(om_curr*i + om_mult); mpn_fft_butterfly_rotbuf(Ap, ind+i, ind+stride+i, root, rotbuf, n); root = omega*(om_curr*(i+stride2) + om_mult); mpn_fft_butterfly_rotbuf(Ap, ind+i+stride2, ind+stride+stride2+i, root, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind+i+stride, ind+stride+stride2+i, omega*(om_curr*i + om_mult)*2, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind+i, ind+stride2+i, omega*(om_curr*i + om_mult)*2, rotbuf, n); } if (k == 3) { mp_size_t root = omega*om_mult*4; mpn_fft_butterfly_rotbuf(Ap, ind+stride+stride2, ind+stride+stride2+1, root, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind+stride, ind+stride+1, root, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind, ind+1, root, rotbuf, n); mpn_fft_butterfly_rotbuf(Ap, ind+stride2, ind+stride2+1, root, rotbuf, n); } if (k > 3) { mpn_fft_fftR4_twistedRec(Ap, ind, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRec(Ap, ind+stride2, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRec(Ap, ind+stride, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRec(Ap, ind+stride+stride2, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); } } static void mpn_fft_fftR4_twisted(mp_ptr * Ap, mp_size_t rk, mp_size_t k1, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf) { mpn_fft_fftR4_twistedRec(Ap, 0, k1, omega, 1<<(k-k1), rk, n, rotbuf); } /* Neg version for 
reverse transform. (see comments above) */ static void mpn_fft_fftR4_twistedRecNeg(mp_ptr * Ap, mp_size_t ind, mp_size_t k, mp_size_t omega, mp_size_t om_curr, mp_size_t om_mult, mp_size_t n, mp_ptr *rotbuf) { mp_size_t stride = 1<<(k-1); mp_size_t stride2 = 1<<(k-2); int i; mp_size_t N = MUL_4GMP_NUMB_BITS(n); /* 4 * n * GMP_NUMB_BITS */ if (k == 0) return; if (k == 1) { mpn_fft_butterfly_rotbufN (Ap, ind, ind + 1, N - omega * om_mult, rotbuf, n); return; } for (i = 0; i < stride2; ++i) { mp_size_t root = omega*(om_curr*i + om_mult); mpn_fft_butterfly_rotbufN(Ap, ind+i, ind+stride+i, N-root, rotbuf, n); root = omega*(om_curr*(i+stride2) + om_mult); mpn_fft_butterfly_rotbufN(Ap, ind+i+stride2, ind+stride+stride2+i, N-root, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind+i+stride, ind+stride+stride2+i, N-omega*(om_curr*i + om_mult)*2, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind+i, ind+stride2+i, N-omega*(om_curr*i + om_mult)*2, rotbuf, n); } if (k == 3) { mp_size_t root = N-omega*om_mult*4; mpn_fft_butterfly_rotbufN(Ap, ind+stride+stride2, ind+stride+stride2+1, root, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind+stride, ind+stride+1, root, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind, ind+1, root, rotbuf, n); mpn_fft_butterfly_rotbufN(Ap, ind+stride2, ind+stride2+1, root, rotbuf, n); } if (k > 3) { mpn_fft_fftR4_twistedRecNeg(Ap, ind, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRecNeg(Ap, ind+stride2, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRecNeg(Ap, ind+stride, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); mpn_fft_fftR4_twistedRecNeg(Ap, ind+stride+stride2, k-2, omega, om_curr<<2, om_mult<<2, n, rotbuf); } } static void mpn_fft_fftR4_twistedNeg(mp_ptr * Ap, mp_size_t rk, mp_size_t k1, mp_size_t k, mp_size_t omega, mp_size_t n, mp_ptr *rotbuf) { mpn_fft_fftR4_twistedRecNeg(Ap, 0, k1, omega, 1<<(k-k1), rk, n, rotbuf); } #if 0 /* Radix-2 version of the previous function. 
Obsolete, now, but more easy to understand; so I let it here. */ static void mpn_fft_fft_twistedRec(mp_ptr * Ap, mp_size_t ind, mp_size_t k, mp_size_t omega, mp_size_t om_curr, mp_size_t om_mult, mp_size_t n, mp_ptr *rotbuf) { const mp_size_t stride = 1<<(k-1); int i; if (k == 0) return; for (i = 0; i < stride; ++i) { mp_size_t root = (omega*(om_curr*i + om_mult)); mpn_fft_butterfly_rotbuf(Ap, ind+i, ind+stride+i, root, rotbuf, n); } mpn_fft_fft_twistedRec(Ap, ind, k-1, omega, om_curr<<1, om_mult<<1, n, rotbuf); mpn_fft_fft_twistedRec(Ap, ind+stride, k-1, omega, om_curr<<1, om_mult<<1, n, rotbuf); } #endif static void mpn_fft_fft_bailey_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t k, mp_size_t omega, mp_size_t nprime, mp_srcptr n, mp_size_t nl, int l, mp_ptr *rotbuf, int b) { const mp_size_t k1 = k >> 1; const mp_size_t k2 = k - k1; int i, j; const mp_size_t K1 = 1 << k1; const mp_size_t K2 = 1 << k2; mp_size_t omegai; mp_ptr *BufA; mp_ptr T, tmp = NULL; const int Kl = l << k; TMP_DECL; TMP_MARK; BufA = TMP_ALLOC_MP_PTRS (K1); ASSERT(BufA != NULL); T = __GMP_ALLOCATE_FUNC_LIMBS(nprime + 1); ASSERT(T != NULL); if (nl > Kl) { tmp = __GMP_ALLOCATE_FUNC_LIMBS(Kl + 1); ASSERT(tmp != NULL); mpn_mul_fft_reduce (tmp, /* A, */ n, nl, Kl, /* l, */ b); n = tmp; nl = Kl + 1; } for (i = 0; i < K2; ++i) { /* Do the decomposition */ /* omega is equal to Mp value */ mpn_mul_fft_decompose (A, Ap, 1<= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) { int k, K2, nprime2, Nprime2, M2, maxLK, l; int **_fft_l; mp_ptr *Ap, *Bp, A, B, T; k = mpn_fft_best_k (n, sqr); K2 = 1 << k; ASSERT_ALWAYS((n & (K2 - 1)) == 0); maxLK = LCM_GMP_NUMB_BITS (k); M2 = MUL_GMP_NUMB_BITS(n) >> k; l = n >> k; Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK; /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK*/ nprime2 = DIV_GMP_NUMB_BITS (Nprime2); /* Nprime2 / GMP_NUMB_BITS */ /* we should ensure that nprime2 is a multiple of the next K */ if (nprime2 >= (sqr ? 
SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) { unsigned long K3; for (;;) { K3 = 1L << mpn_fft_best_k (nprime2, sqr); if ((nprime2 & (K3 - 1)) == 0) break; nprime2 = (nprime2 + K3 - 1) & -K3; Nprime2 = nprime2 * GMP_LIMB_BITS; /* warning: since nprime2 changed, K3 may change too! */ } } ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */ Ap = TMP_ALLOC_MP_PTRS (K2); ASSERT(Ap != NULL); Bp = TMP_ALLOC_MP_PTRS (K2); ASSERT(Bp != NULL); A = TMP_ALLOC_LIMBS (2 * K2 * (nprime2 + 1)); ASSERT(A != NULL); T = TMP_ALLOC_LIMBS (2 * (nprime2 + 1)); ASSERT(T != NULL); B = A + K2 * (nprime2 + 1); _fft_l = TMP_ALLOC_TYPE (k + 1, int *); ASSERT(_fft_l != NULL); for (i = 0; i <= k; i++) { _fft_l[i] = TMP_ALLOC_TYPE (1< %d times %dx%d (%1.2f)\n", n, n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); for (i = 0; i < K; i++, ap++, bp++) { mpn_fft_normalize (*ap, n); if (!sqr) mpn_fft_normalize (*bp, n); mpn_mul_fft_internal (*ap, n, *ap, n + 1, *bp, n + 1, k, Ap, Bp, A, B, nprime2, l, _fft_l, T, 1, 1); } } else { mp_ptr a, b, tp, tpn; mp_limb_t cc; int n2 = 2 * n; tp = TMP_ALLOC_LIMBS (n2); ASSERT(tp != NULL); tpn = tp + n; TRACE (printf ("mpn_fft_mul_modF_K: mpn_mul_n %d of %d limbs\n", K, n)); /* FIXME: write a special loop for the square case, to put the test out of the loop, and optimize the case a[n] != 0: maybe normalizing a and b will be faster? */ for (i = 0; i < K; i++) { a = *ap++; b = *bp++; if (LIKELY(a[0] >= a[n])) { a[0] -= a[n]; a[n] = 0; } if (LIKELY(b[0] >= b[n])) { b[0] -= b[n]; b[n] = 0; } if (sqr) mpn_sqr_n (tp, a, n); else mpn_mul_n (tp, b, a, n); cc = a[n] && mpn_add_n (tpn, tpn, b, n); cc += b[n] && mpn_add_n (tpn, tpn, a, n); cc += b[n] && a[n]; /* 0 <= cc <= 3 */ cc += mpn_sub_n (a, tp, tpn, n); /* 0 <= cc <= 4 */ a[n] = 0; MPN_INCR_U (a, n + 1, cc); } } TMP_FREE; } /* * Mix Point-wise multiplication and inverse FFT. * This is useful, since we save one pass on the whole data, thus * improving the locality. 
* * FIXME: A lot of duplicated code in this function. At some point it * will be necessary to clean-up things to keep it possible to maintain. * */ static void mpn_fft_mul_modF_K_fftInv (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t Mp, int old_k, mp_ptr *rotbuf, int**ll) { int i, j; int sqr = (ap == bp); #if 0 mp_size_t K = 1<= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) { int k, K2, nprime2, Nprime2, M2, maxLK, l; int **_fft_l; mp_ptr *Ap, *Bp, A, B, T; k = mpn_fft_best_k (n, sqr); K2 = 1 << k; ASSERT_ALWAYS((n & (K2 - 1)) == 0); maxLK = LCM_GMP_NUMB_BITS(k); M2 = MUL_GMP_NUMB_BITS(n) >> k; l = n >> k; Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK; /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK*/ nprime2 = DIV_GMP_NUMB_BITS(Nprime2); /* we should ensure that nprime2 is a multiple of the next K */ if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) { unsigned long K3; for (;;) { K3 = 1L << mpn_fft_best_k (nprime2, sqr); if ((nprime2 & (K3 - 1)) == 0) break; nprime2 = (nprime2 + K3 - 1) & -K3; Nprime2 = nprime2 * GMP_LIMB_BITS; /* warning: since nprime2 changed, K3 may change too! 
*/ } } ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */ Ap = TMP_ALLOC_MP_PTRS (K2); ASSERT(Ap != NULL); Bp = TMP_ALLOC_MP_PTRS (K2); ASSERT(Bp != NULL); A = __GMP_ALLOCATE_FUNC_LIMBS (2 * K2 * (nprime2 + 1)); ASSERT(A != NULL); T = TMP_ALLOC_LIMBS (2 * (nprime2 + 1)); ASSERT(T != NULL); B = A + K2 * (nprime2 + 1); _fft_l = TMP_ALLOC_TYPE (k + 1, int *); ASSERT(_fft_l != NULL); for (i = 0; i <= k; i++) { _fft_l[i] = TMP_ALLOC_TYPE (1< %d times %dx%d (%1.2f)\n", n, n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); { mp_size_t k1, k2, K1, omega, omegai; mp_ptr *BufA; k1 = old_k >> 1; k2 = old_k - k1; #if 0 /* unused variables */ mp_ptr a, b; mp_limb_t cc; mp_size_t N = MUL_4GMP_NUMB_BITS(n); /* 4 * n * GMP_NUMB_BITS */ #endif K1 = 1 << k1; K2 = 1 << k2; /* we overwrite the previous variable, here, but it is no longer used */ omega = Mp; BufA = TMP_ALLOC_MP_PTRS (K1); ASSERT(BufA != NULL); for (i = 0; i < K2; ++i) { /* copy the i-th column of Ap into BufA (pointers... no real copy) */ for (j = 0; j < K1; ++j) { /* Do the point-wise multiplication, the bitreverse and the column selection at once. Should help locality (not readibility). */ int ind = ll[old_k][i+K2*j]; mpn_fft_normalize (ap[ind], n); if (!sqr) mpn_fft_normalize (bp[ind], n); mpn_mul_fft_internal (ap[ind], n, ap[ind], n + 1, bp[ind], n + 1, k, Ap, Bp, A, B, nprime2, l, _fft_l, T, 1, 1); BufA[j] = ap[ind]; } /* do the level k1 transform */ mpn_fft_fftR4_twistedNeg(BufA, i, k1, old_k, omega, n, rotbuf); /* copy back (since with the rotating buffer, the pointers have been moved around. */ for (j = 0; j < K1; ++j) ap[ll[old_k][i+K2*j]] = BufA[j]; } for (i = 0; i < 1<= cc; /* rare case where R = B^n */ } r[n - 1] -= cc; } /* A <- A/sqrt(2)^k mod 2^(n*GMP_NUMB_BITS)+1. Assumes 0 < k < 4*n*GMP_NUMB_BITS. FIXME: can we use the trick used in mpn_fft_div_sqrt2exp_modF above? 
*/ static void mpn_fft_div_sqrt2exp_modF (mp_ptr r, mp_srcptr a, unsigned int k, mp_size_t n) { ASSERT (r != a); #ifdef NO_SQRT_2 ASSERT_ALWAYS (k % 2 == 0); #endif ASSERT (0 < k && k < MUL_4GMP_NUMB_BITS((unsigned int) n)); mpn_fft_mul_sqrt2exp_modF (r, a, MUL_4GMP_NUMB_BITS(n) - k, n); /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */ /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */ mpn_fft_normalize (r, n); } /* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+b, where b=1 or b=-1. Returns carry out, i.e. 1 iff b=1 and {ap,an}=-1 mod 2^(n*GMP_NUMB_BITS)+1, then {rp,n}=0. No restriction on an, except an >= 1. */ static mp_limb_t mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_srcptr ap, mp_size_t an, int b) { if (an <= n) { #ifdef COUNT_ZEROCOPY printf ("mpn_fft_norm_modF: MPN_FFT_COPY with %d limbs\n", an); #endif MPN_COPY (rp, ap, an); if (an < n) MPN_FFT_ZERO (rp + an, n - an); return 0; } else /* an > n */ { mp_size_t l; mp_limb_t cc; int i; l = (an <= 2 * n) ? an - n : n; if (b == -1) cc = mpn_add (rp, ap, n, ap + n, l); else cc = -mpn_sub (rp, ap, n, ap + n, l); ap += n + l; an -= n + l; for (i = -1; an > 0; i = -b * i) { /* it remains to deal with {ap, an} */ l = (an <= n) ? an : n; if (i == -1) cc += mpn_add (rp, rp, n, ap, l); else cc -= mpn_sub (rp, rp, n, ap, l); ap += l; an -= l; } if (b == 1) { if (cc & GMP_LIMB_HIGHBIT) /* cc < 0 */ cc = mpn_add_1 (rp, rp, n, -cc); cc = mpn_sub_1 (rp, rp, n, cc); } else /* b = -1: necessarily cc >= 0 */ cc = mpn_add_1 (rp, rp, n, cc); return mpn_add_1 (rp, rp, n, cc); } } /* op <- n*m mod 2^N+b with fft of size 2^k where N=pl*GMP_NUMB_BITS n and m have respectively nl and ml limbs op must have space for pl+1 limbs if rec=1 (and pl limbs if rec=0). One must have pl = mpn_fft_next_size (pl, k). T must have space for 2 * (nprime + 1) limbs. If rec=0, then store only the pl low bits of the result, and return the out carry. Assumes b=1 (negacyclic convolution) or b=-1 (cyclic convolution). 
*/ static int mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, mp_srcptr n, mp_size_t nl, mp_srcptr m, mp_size_t ml, int k, mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B, mp_size_t nprime, mp_size_t l, int **_fft_l, mp_ptr T, int rec, int b) { const int K = 1<> k; int i, sqr, pla, lo, sh, j; mp_ptr p; mp_limb_t cc; mp_ptr rotbufA[1], rotbufB[1]; /* we need two rotating buffers, otherwise some Ap[i] may point to the B[] array, and will be erase since we use the B[] array to store the final result {p,pla} */ mp_ptr bufAptr, bufBptr; /* Remember pointers to free memory */ bufAptr = rotbufA[0] = __GMP_ALLOCATE_FUNC_LIMBS(nprime+1); ASSERT(rotbufA[0] != NULL); bufBptr = rotbufB[0] = __GMP_ALLOCATE_FUNC_LIMBS(nprime+1); ASSERT(rotbufB[0] != NULL); ASSERT(b == 1 || b == -1); sqr = n == m && nl == ml; TRACE (printf ("mpn_mul_fft_internal: pl=%d k=%d K=%d np=%d l=%d Mp=%d " "rec=%d sqr=%d b=%d\n", pl,k,K,nprime,l,Mp,rec,sqr,b)); #define BAILEY_THRESHOLD 9 /* direct fft's */ /* This threshold for Bailey's algorithm has been determined experimentally on an Opteron. */ if (k >= BAILEY_THRESHOLD) { TRACE(printf("Calling mpn_fft_fft_bailey(Ap, %d, %d, %d, T, ...)\n", k,Mp,nprime);) /* decomposition of inputs into arrays Ap[i] and Bp[i] */ mpn_fft_fft_bailey_decompose (A, Ap, k, Mp, nprime, n, nl, l, rotbufA, b); if (!sqr) mpn_fft_fft_bailey_decompose (B, Bp, k, Mp, nprime, m, ml, l, rotbufB, b); } else { TRACE(printf("Calling mpn_fft_fft_radix4(Ap, %d, %d, %d, T, ...)\n", k,Mp,nprime);) /* decomposition of inputs into arrays Ap[i] and Bp[i] */ mpn_mul_fft_decompose (A, Ap, K, 0, 0, nprime, n, nl, l, Mp, T, b); if (sqr == 0) mpn_mul_fft_decompose (B, Bp, K, 0, 0, nprime, m, ml, l, Mp, T, b); mpn_fft_fft_radix4 (Ap, k, Mp, nprime, rotbufA); if (!sqr) mpn_fft_fft_radix4 (Bp, k, Mp, nprime, rotbufB); } /* * We want to multipy the K transformed elements of A and B (or A and A * if we're squaring), with products reduced (mod 2^Nprime+1) * * Then we must do the backward transform. 
* * If we are below Bailey's threshold, we assume that the data fits in * the cache and do those 2 tasks separately. Otherwise we mix them: we * do the point-wise products for the elements of one column, then we * readily do the transform of the column since we have it in cache. * The code becomes messy (especially when you add the bitreverse * stuff), but this saves a bit. */ if (k >= BAILEY_THRESHOLD) { mpn_fft_mul_modF_K_fftInv (Ap, (sqr) ? Ap : Bp, nprime, Mp, k, rotbufA, _fft_l); } else { mpn_fft_mul_modF_K (Ap, (sqr) ? Ap : Bp, nprime, K); TRACE(printf("mpn_mul_fft_internal: Calling mpn_fft_fft_radix4Inv(Ap, %d, " "%d, %d, T, ...)\n", k, Mp, nprime);) mpn_fft_fft_radix4Inv (Ap, k, Mp, nprime, rotbufA, _fft_l); } Bp[0] = T + nprime + 1; /* addition of terms in result p */ MPN_FFT_ZERO (T, nprime + 1); pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */ ASSERT (K * (nprime + 1) >= pla); MPN_FFT_ZERO (p, pla); cc = 0; /* will accumulate the (signed) carry at p[pla] */ for (i = K - 1, lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l) { mp_ptr n = p + sh; j = (K - i) & (K - 1); /* Multiply by appropriate root and reorder. We want to divide by the transform length, so divide by sqrt(2)^(2*k) == 2^k */ if (j > 0 && b == 1) mpn_fft_div_sqrt2exp_modF (Bp[0], Ap[K - j], 2 * k + (K - j) * (Mp / 2), nprime); else /* No unweighting to be done, only divide by transform length */ mpn_fft_div_2exp_modF (Bp[0], Ap[(K - j) & (K - 1)], k, nprime); Bp[j] = Bp[0]; if (mpn_add_n (n, n, Bp[j], nprime + 1)) cc += mpn_add_1 (n + nprime + 1, n + nprime + 1, pla - sh - nprime - 1, ONE); T[2 * l] = (b == 1) ? 
i + 1 : K; /* T = (i + 1)*2^(2*M) */ if (mpn_cmp (Bp[j], T, nprime + 1) > 0) { /* subtract 2^N'+1 from {n, nprime} */ cc -= mpn_sub_1 (n, n , pla - sh, ONE); cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, ONE); } } if (cc == -ONE) { if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, ONE))) { /* p[pla-pl]...p[pla-1] are all zero */ mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, ONE); mpn_sub_1 (p + pla - 1, p + pla - 1, 1, ONE); } } else if (cc == ONE) { if (pla >= 2 * pl) { while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc))) ; } else { cc = mpn_sub_1 (p + pla - pl, p + pla - pl, pl, cc); ASSERT (cc == 0); } } else { ASSERT (cc == 0); } /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ] < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ i = mpn_fft_norm_modF (op, pl, p, pla, b); if (rec) /* store the carry out */ op[pl] = i; __GMP_FREE_FUNC_LIMBS(bufAptr, nprime+1); __GMP_FREE_FUNC_LIMBS(bufBptr, nprime+1); return i; } /* return the lcm of a and 2^k */ static inline unsigned int mpn_mul_fft_lcm (unsigned int a, unsigned int k) { unsigned int l = k; while ((a & 1) == 0 && k > 0) { a >>= 1; k --; } return a << l; } static int mpn_mul_fft_aux (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, int, int); /* put in {op, pl} the low pl limbs of the product {n, nl} * {m, ml} mod (B^pl+1) where B = 2^GMP_NUMB_BITS, and returns the carry bit, which is 1 when {n, nl} * {m, ml} = B^pl, and is 0 otherwise. */ int mpn_mul_fft (mp_ptr op, mp_size_t pl, mp_srcptr n, mp_size_t nl, mp_srcptr m, mp_size_t ml, int k) { return mpn_mul_fft_aux (op, pl, n, nl, m, ml, k, 1); } /* put in {op, pl} the product of {n, nl} * {m, ml} mod (B^pl-1) where B = 2^GMP_NUMB_BITS. 
*/ static int mpn_mul_fft_mersenne (mp_ptr op, mp_size_t pl, mp_srcptr n, mp_size_t nl, mp_srcptr m, mp_size_t ml, int k) { return mpn_mul_fft_aux (op, pl, n, nl, m, ml, k, -1); } /* put in {op, pl} + carry out the product {n, nl} * {m, ml} modulo 2^(pl*GMP_NUMB_BITS) + b, where b = 1 or b = -1. */ static int mpn_mul_fft_aux (mp_ptr op, const mp_size_t pl, mp_srcptr n, mp_size_t nl, mp_srcptr m, mp_size_t ml, int k, const int b) { int maxLK, i, c; const int K = 1 << k; mp_size_t N, Nprime, nprime, M, l; mp_ptr *Ap, *Bp, A, T, B; int **_fft_l; int sqr = (n == m && nl == ml), use_tmp_n, use_tmp_m; TMP_DECL; TRACE (printf ("\nmpn_mul_fft_aux: mpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d " "b=%d\n", pl, nl, ml, k, b)); ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl); TMP_MARK; /* first reduce {n, nl} or {m, ml} if nl > pl or ml > pl */ if ((use_tmp_n = nl > pl)) { mp_ptr nn = __GMP_ALLOCATE_FUNC_LIMBS(pl + (b == 1)); ASSERT(nn != NULL); if ((i = mpn_fft_norm_modF (nn, pl, n, nl, b))) nn[pl] = 1; n = nn; nl = pl + i; } if ((use_tmp_m = ml > pl)) { mp_ptr mm = __GMP_ALLOCATE_FUNC_LIMBS(pl + (b == 1)); ASSERT(mm != NULL); if ((i = mpn_fft_norm_modF (mm, pl, m, ml, b))) mm[pl] = 1; m = mm; ml = pl + i; } /* now nl,ml <= pl if b=-1, nl,ml <= pl+1 if b=1 */ N = MUL_GMP_NUMB_BITS(pl); /* The entire integer product will be mod 2^N+b */ _fft_l = TMP_ALLOC_TYPE (k + 1, int *); ASSERT(_fft_l != NULL); for (i = 0; i <= k; i++) { _fft_l[i] = TMP_ALLOC_TYPE (1 << i, int); ASSERT(_fft_l[i] != NULL); } mpn_fft_initl (_fft_l, k); M = N >> k; /* The number of bits we need to be able to store in each of the 2^k pieces */ l = 1 + DIV_GMP_NUMB_BITS(M - 1); /* nb of limbs in each of the 2^k pieces */ /* Choose maxLK so that an order 4*2^k root of unity exists for the negacyclic transform (which needs a root of unity of order twice the transform length for the weight signal), or an order 2*2^k root of unity for the cyclic transform (which uses no weight signal) */ #ifdef NO_SQRT_2 c = (b == -1) ? 
1 : 0; #else c = (b == -1) ? 2 : 1; #endif ASSERT(k >= c); maxLK = LCM_GMP_NUMB_BITS (k - c); /* maxLK = lcm (GMP_NUMB_BITS, 2^(k-1) for b=1, 2^(k-2) for b=-1) */ /* When we do the transforms with elements (mod 2^Nprime+1), we need GMP_NUMB_BITS|Nprime so that shifts are fast, and transformlength|2*c*Nprime so that transformlength|ord(2) for b==1 or transformlength|ord(sqrt(2)) for b==-1 */ Nprime = 2 * M + k + 2; /* make Nprime large enough so that the coefficients in the product polynomial are not affected by reduction (mod 2^Nprime+1). FIXME is the +2 necessary? */ Nprime = (Nprime / maxLK + 1) * maxLK; /* Round up Nprime to multiple of both GMP_NUMB_BITS and 2^(k-1) */ nprime = DIV_GMP_NUMB_BITS(Nprime); /* number of limbs in poly coefficient */ TRACE(printf ("mpn_mul_fft_aux: N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, " "np=%d\n", N, K, M, l, maxLK, Nprime, nprime);) /* we should ensure that recursively, nprime is a multiple of the next K */ if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) { unsigned long K2; for (;;) { K2 = 1L << mpn_fft_best_k (nprime, sqr); if ((nprime & (K2 - 1)) == 0) break; nprime = (nprime + K2 - 1) & -K2; /* round up nprime to multiple of K2 */ Nprime = nprime * GMP_LIMB_BITS; /* warning: since nprime changed, K2 may change too! 
*/ } TRACE (printf ("mpn_mul_fft_aux: new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime)); } ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */ T = __GMP_ALLOCATE_FUNC_LIMBS (2 * (nprime + 1)); ASSERT(T != NULL); TRACE (printf ("mpn_mul_fft_aux: %dx%d limbs -> %d times %dx%d limbs (%1.2f)\n", pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K); printf (" temp space %ld\n", 2 * K * (nprime + 1));) A = __GMP_ALLOCATE_FUNC_LIMBS (2 * K * (nprime + 1)); if (A == NULL) { fprintf (stderr, "Cannot allocate memory, please use -maxmem\n"); exit (EXIT_FAILURE); } B = A + K * (nprime + 1); Ap = TMP_ALLOC_MP_PTRS (K); ASSERT(Ap != NULL); Bp = TMP_ALLOC_MP_PTRS (K); ASSERT(Bp != NULL); i = mpn_mul_fft_internal (op, pl, n, nl, m, ml, k, Ap, Bp, A, B, nprime, l, _fft_l, T, 0, b); TMP_FREE; __GMP_FREE_FUNC_LIMBS (T, 2 * (nprime + 1)); __GMP_FREE_FUNC_LIMBS (A, 2 * K * (nprime + 1)); if (use_tmp_n) __GMP_FREE_FUNC_LIMBS ((mp_ptr) n, pl + (b == 1)); if (use_tmp_m) __GMP_FREE_FUNC_LIMBS ((mp_ptr) m, pl + (b == 1)); return i; } /* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml}, using one modular product mod 2^N-1 and one mod 2^(aN)+1, with a >= 1. */ static void mpn_mul_fft_full_a (mp_ptr op, mp_srcptr n, mp_size_t nl, mp_srcptr m, mp_size_t ml, int a) { mp_size_t pl = nl + ml; /* total number of limbs of the result */ int sqr = n == m && nl == ml; mp_size_t l, h; mp_limb_t muh, cc; int k1, k2, i; mp_ptr tp; l = (pl + a + (a > 1)) / (a + 1); /* ceil(pl/(a+1)) */ /* Warning: for a > 1, the product may be larger than (2^N-1) * (2^(aN)+1), thus we take one extra limb. */ k1 = mpn_fft_best_k (l, 2 + sqr); /* for 2^N-1 */ k2 = mpn_fft_best_k (a * l, sqr); /* for 2^(aN)+1 */ /* we must have l multiple of 2^k1 and a*l multiple of 2^k2. FIXME: the optimal k1 and k2 values might change in the while loop. 
*/ while (1) { h = mpn_fft_next_size (l, k1); if (h != l) l = h; else { h = mpn_fft_next_size (a * l, k2); if (h != a * l) l = (h + a - 1) / a; /* ceil(h/a) */ else break; } } h = a * l; /* now mpn_fft_next_size (l, k1) = l and mpn_fft_next_size (h, k2) = h with h = a * l */ /* we perform one FFT mod 2^(aN)+1 and one mod 2^N-1. Let P = n * m. Assume P = lambda * (2^(aN)+1) + mu, with 0 <= mu < 2^(aN)+1, and 0 <= lambda < 2^N-1. Then P = mu mod (2^(aN)+1) and P = 2*lambda+mu mod (2^N-1). Let A := P mod (2^(aN)+1) and B := P mod (2^N-1), with 0 <= A < 2^(aN)+1 and 0 <= B < 2^N-1. Then mu = A, and lambda = (B-A)/2 mod (2^N-1). */ ASSERT_ALWAYS(h < pl); muh = mpn_mul_fft (op, h, n, nl, m, ml, k2); /* mu = muh+{op,h} */ tp = __GMP_ALLOCATE_FUNC_LIMBS (l); ASSERT (tp != NULL); mpn_mul_fft_mersenne (tp, l, n, nl, m, ml, k1); /* B */ /* now compute B-A mod 2^N-1, where B = {tp, l}, and A = cc + {op, h} */ for (cc = muh, i = 0; i < a; i++) cc += mpn_sub_n (tp, tp, op + i * l, l); /* cc is a borrow at tp[0] */ while (cc > 0) /* add cc*(2^N-1): if cc=1 after the first loop, then tp[l-1] = 111...111, and cc=0 after the 2nd loop */ cc = mpn_sub_1 (tp, tp, l, cc); /* Check whether {tp,l} = 111...111, in which case we should reduce it to 000...000. */ for (i = 0; i < l && ~tp[i] == 0; i++); if (i == l) mpn_add_1 (tp, tp, l, 1); /* reduces {tp,l} to 000...000 */ /* make cc + {tp, l} even, and divide by 2 */ if (tp[0] & (mp_limb_t) 1) cc = 1 - mpn_sub_1 (tp, tp, l, 1); /* add 2^N-1 */ /* now we have to compute lambda * (2^(aN)+1) + mu, where 2*lambda = {tp, l} and mu = muh + {op, h} */ mpn_rshift (op + h, tp, pl - h, 1); /* divide by 2 to obtain lambda */ if (pl < l + h) /* i.e. 
pl - h < l: it remains high limbs in {tp, l} */ { /* since the product is P = lambda * (2^N+1) + mu, if cc=1, the product would exceed pl < h+l limbs */ ASSERT_ALWAYS (cc == 0); cc = tp[pl - h] & 1; } op[pl - 1] |= cc << (GMP_NUMB_BITS - 1); __GMP_FREE_FUNC_LIMBS (tp, l); /* since n * m has at most pl limbs, the high part of lambda should be 0 */ cc = mpn_add_n (op, op, op + h, pl - h); /* add lambda to mu */ MPN_INCR_U (op + pl - h, h, cc); MPN_INCR_U (op + h, pl - h, muh); } /* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */ void mpn_mul_fft_full (mp_ptr op, mp_srcptr n, mp_size_t nl, mp_srcptr m, mp_size_t ml) { #ifndef MUL_FFT_FULL_TABLE2 mpn_mul_fft_full_a (op, n, nl, m, ml, 1); #else int a = mpn_fft_best_a ((nl + ml) / 2, n == m && nl == ml); mpn_mul_fft_full_a (op, n, nl, m, ml, a); #endif return; } ecm-6.4.4/test.ecm0000755023561000001540000002351312106741273010672 00000000000000#!/bin/sh # test file for ECM # # Copyright 2002, 2003, 2004, 2005, 2006, 2008, 2009, 2011, 2012 # Jim Fougeron, Alexander Kruppa, Dave Newman, Paul Zimmermann, Cyril Bouvier, # David Cleaver. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. If not, see # http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., # 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. ECM="$1" # Call with "checkcode $? 
n" to check that return code is n # (see test.pm1 for the explanation of the different return codes) checkcode () { if [ $1 != $2 ] then echo "############### ERROR ###############" echo "Expected return code $2 but got $1" exit 1 fi } # Exit statues returned by GMP-ECM: # 0 Normal program termination, no factor found # 1 Error # 2 Composite factor found, cofactor is composite # 6 Probable prime factor found, cofactor is composite # 8 Input number found # 10 Composite factor found, cofactor is a probable prime # 14 Probable prime factor found, cofactor is a probable prime BATCH0="" # try primes < d in stage 2. Curve with sigma=7, mod 30210181 has order # 2^4 * 3^3 * 29 * 2411 echo 2050449353925555290706354283 | $ECM $BATCH0 -sigma 7 -k 1 30 0-1e6; checkcode $? 14 # check the -treefile option echo 2050449353925555290706354283 | $ECM $BATCH0 -treefile tree -sigma 7 -k 1 30 0-1e6; checkcode $? 14 # Check a stage 2 of length 1. g1=1822795201 g2=968809 g3=567947 echo 212252637915375215854013140804296246361 | $ECM $BATCH0 -sigma 781683988 -go 550232165123 63421 1822795201-1822795201; checkcode $? 8 # tests from Torbjo"rn Granlund echo 137703491 | $ECM $BATCH0 -sigma 6 84 1000; checkcode $? 8 echo 3533000986701102061387017352606588294716061 | $ECM $BATCH0 -sigma 1621 191 225; checkcode $? 14 echo 145152979917007299777325725119 | $ECM $BATCH0 -sigma 711387948 924 117751; checkcode $? 14 # Test a few base 2 numbers. These tests are fairly quick. # Test a 2^n-1 number, factor found in stage 1. Order mod 33554520197234177 # with sigma=262763035 is 2^3*3*5*47*59*241*601*743*937 echo "2^919-1" | $ECM $BATCH0 -sigma 262763035 937 1; checkcode $? 6 # Test a 2^n-1 number, factor found in stage 2. Order mod 33554520197234177 # with sigma=1691973485 is 2^6*3*11*29*59*73*263*283*1709 echo "2^919-1" | $ECM $BATCH0 -sigma 1691973485 283 1709; checkcode $? 6 # Test a 2^n+1 number, factor found in stage 1. 
Order mod 24651922299337 # with sigma=2301432245 is 2^3*3^3*5^2*7^2*17*67*157*521 echo "(2^1033+1)/3" | $ECM $BATCH0 -sigma 2301432245 521 1; checkcode $? 6 # Test a 2^n+1 number, factor found in stage 2. Order mod 24651922299337 # with sigma=2394040080 is 2^2*3^2*13*19*53*127*223*1847 echo "(2^1033+1)/3" | $ECM $BATCH0 -sigma 2394040080 223 1847; checkcode $? 6 # Test another 2^n+1 number, with a larger known factor divided out. # Factor found in stage 1, order mod 114584129081 with sigma=2399424618 # is 2^9*3^2*5^2*7^2*53*383 echo "(2^1063+1)/3/26210488518118323164267329859" | $ECM $BATCH0 -sigma 2399424618 383 1 ; checkcode $? 6 # Like last one, but factor found in stage 2 echo "(2^1063+1)/3/26210488518118323164267329859" | $ECM $BATCH0 -sigma 2399424618 71 500; checkcode $? 6 echo 242668358425701966181147598421249782519178289604307455138484425562807899 | $ECM $BATCH0 -sigma 1417477358 28560 8e7-85507063; checkcode $? 14 # bug found by Jim Fougeron echo 3533000986701102061387017352606588294716061 | $ECM $BATCH0 -sigma 291310394389387 191 225; checkcode $? 14 echo 121279606270805899614487548491773862357 | $ECM $BATCH0 -sigma 1931630101 120; checkcode $? 14 echo 291310394389387 | $ECM $BATCH0 -power 3 -sigma 40 2000; checkcode $? 8 echo 3533000986701102061387017352606588294716061 | $ECM $BATCH0 -sigma 3547 167 211; checkcode $? 14 # test -go option echo 449590253344339769860648131841615148645295989319968106906219761704350259884936939123964073775456979170209297434164627098624602597663490109944575251386017 | $ECM $BATCH0 -sigma 63844855 -go 172969 61843 20658299; checkcode $? 14 echo 17061648125571273329563156588435816942778260706938821014533 | $ECM $BATCH0 -sigma 585928442 174000; checkcode $? 14 echo 89101594496537524661600025466303491594098940711325290746374420963129505171895306244425914080753573576861992127359576789001 | $ECM $BATCH0 -sigma 877655087 -go 325001 157721 1032299; checkcode $? 
14 echo 5394204444759808120647321820789847518754252780933425517607611172590240019087317088600360602042567541009369753816111824690753627535877960715703346991252857 | $ECM $BATCH0 -sigma 805816989 -go 345551 149827; checkcode $? 6 echo 3923385745693995079670229419275984584311007321932374190635656246740175165573932140787529348954892963218868359081838772941945556717 | $ECM $BATCH0 -sigma 876329474 141667 150814537; checkcode $? 14 echo 124539923134619429718018353168641490719788526741873602224103589351798060075728544650990190016536810151633233676972068237330360238752628542584228856301923448951 | $ECM $BATCH0 -sigma 1604840403 -go "1260317*1179109*661883" 96097 24289207; checkcode $? 14 # p49 found by Sean Irvine echo 4983070578699621345648758795946786489699447158923341167929707152021191319057138908604417894224244096909460401007237133698775496719078793168004317119431646035122982915288481052088094940158965731422616671 | $ECM $BATCH0 -sigma 909010734 122861 176711; checkcode $? 6 # bug in ecm-5.0 (overflow in fin_diff_coeff) echo 1408323592065265621229603282020508687 | $ECM $BATCH0 -sigma 1549542516 -go 2169539 531571 29973883000-29973884000; checkcode $? 8 # bug in ecm 5.0 and 5.0.1 (factor found for c110 input, not with p58) echo 3213162276640339413566047915418064969550383692549981333701 | $ECM $BATCH0 -sigma 2735675386 -go 1615843 408997 33631583; checkcode $? 8 echo 39614081257132168796771975177 | $ECM $BATCH0 -sigma 480 1e6; checkcode $? 8 echo 10000286586958753753 | $ECM $BATCH0 -sigma 3956738175 1e6; checkcode $? 8 echo 49672383630046506169472128421 | $ECM $BATCH0 -sigma 2687434659 166669 86778487; checkcode $? 8 echo 216259730493575791390589173296092767511 | $ECM $BATCH0 -sigma 214659179 1124423 20477641; checkcode $? 8 # bug reported by Allan Steel on 14 March 2006 echo 49367108402201032092269771894422156977426293789852367266303146912244441959559870316184237 | $ECM $BATCH0 -sigma 6 5000; checkcode $? 
0 # A test with a larger input number to test modular arithmetic routines not # in mulredc*.asm. This input has 1363 bits so it has 22 64 bit words # (43 32 bit words) and cannot use mulredc which handles only up to 20 limbs echo "10090030271*10^400+696212088699" | $ECM $BATCH0 -sigma 3923937547 1e3 1e6; checkcode $? 14 # To test batch mode 1 # the following test works both on 32- and 64-bit machines # on 32-bit machines it corresponds to d' = 42 # on 64-bit machines it corresponds to d' = 42*2^32 echo 458903930815802071188998938170281707063809443792768383215233 | $ECM -batch -A 103699173453039012668349162616750601868936199904547322268878 10000 checkcode $? 14 # same with batch=2 echo 458903930815802071188998938170281707063809443792768383215233 | $ECM -batch=2 -A 103699173453039012668349162616750601868936199904547322268878 10000 checkcode $? 14 # this test corresponds to d'=13 on 32-bit, 13*2^32 on 64-bit echo "2^349-1" | $ECM -batch -A 13883915733485915535567641090102088744917579395318243004655770450844428217574163575149253565087742 587 29383 checkcode $? 6 # same with batch=2 echo "2^349-1" | $ECM -batch=2 -A 13883915733485915535567641090102088744917579395318243004655770450844428217574163575149253565087742 587 29383 checkcode $? 6 # another batch-mode test (d' = 1097 on 32-bit, 1097*2^32 on 64-bit) echo "2^347-1" | $ECM -batch -A 292897222300654795048417351458499833714895857628156011078988080472621879897670335421898676171177982 3301 229939 checkcode $? 14 # same with batch=2 echo "2^347-1" | $ECM -batch=2 -A 292897222300654795048417351458499833714895857628156011078988080472621879897670335421898676171177982 3301 229939 checkcode $? 14 # To test batch mode 2 echo 911962091 | $ECM -batch=2 -A 440688534 50000 checkcode $? 8 echo 31622776601683791911 | $ECM -batch=1 -A 27063318473587686303 11000 checkcode $? 
0 # non-regression test for bug fixed by changeset r1819 on 64-bit # (this also produces a small d' on 32-bit, thus can be used with batch=1) echo 18446744073709551557 | $ECM -batch -A 312656731337392125 11000 checkcode $? 8 # non-regression test for bug fixed by changeset r1819 on 32-bit echo 4294967291 | $ECM -batch -A 17 1000 checkcode $? 8 # this example would fail with the old Fgw.c when using gwnum (fixed by David Cleaver, r1734) echo "((173^173+1)/174)/471462511391940575680645418941" | $ECM $BATCH0 -sigma 12345 20000 checkcode $? 0 # this test was failing on gcc45.fsffrance.org with 6.4.1 echo "((173^173+1)/174)/471462511391940575680645418941+122" | $ECM $BATCH0 -sigma 77 20000 checkcode $? 6 # the following tests should work on machines which have uint64_t or where # unsigned long long has 64 bits (exercises patch from David Cleaver, r1575) echo "NOTE: NEXT 3 TESTS WILL FAIL ON SOME 32BIT MACHINES, THIS IS EXPECTED." echo 10000000000000000000000000000000000000121 | $ECM $BATCH0 -sigma 61 -go 1195504287780095287 2950307; checkcode $? 8 echo 10000000000000000000000000000000000000121 | $ECM $BATCH0 -sigma 266 -go 218187387944803649 9405629; checkcode $? 8 echo 10000000000000000000000000000000000000121 | $ECM $BATCH0 -sigma 291 -go 5994496018878137 4372759; checkcode $? 8 echo "All ECM tests are ok." ecm-6.4.4/spv.c0000644023561000001540000003211712106741273010176 00000000000000/* spv.c - "small prime vector" functions for arithmetic on vectors of residues modulo a single small prime Copyright 2005, 2006, 2007, 2008, 2009 Dave Newman, Jason Papadopoulos, Brian Gladman, Alexander Kruppa, Paul Zimmermann. The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include /* for memset */ #include "sp.h" /* Routines for vectors of integers modulo r common small prime * * These are low-overhead routines that don't do memory allocation, * other than for temporary variables. Unless otherwise specified, any * of the input pointers can be equal. */ /* r = x */ void spv_set (spv_t r, spv_t x, spv_size_t len) { #ifdef HAVE_MEMMOVE /* memmove doesn't rely on the assertion below */ memmove (r, x, len * sizeof (sp_t)); #else spv_size_t i; ASSERT (r >= x + len || x >= r); for (i = 0; i < len; i++) r[i] = x[i]; #endif } /* r[0 ... len - 1] = x[len - 1 ... 0] */ void spv_rev (spv_t r, spv_t x, spv_size_t len) { spv_size_t i; ASSERT (r >= x + len || x >= r + len); for (i = 0; i < len; i++) r[i] = x[len - 1 - i]; } /* r = [y, y, ... ] */ void spv_set_sp (spv_t r, sp_t y, spv_size_t len) { spv_size_t i; for (i = 0; i < len; i++) r[i] = y; } void spv_set_zero (spv_t r, spv_size_t len) { memset (r, 0, len * sizeof (sp_t)); } int spv_cmp (spv_t x, spv_t y, spv_size_t len) { spv_size_t i; for (i = 0; i < len; i++) if (x[i] != y[i]) return 1; return 0; } /* r = x + y */ void spv_add (spv_t r, spv_t x, spv_t y, spv_size_t len, sp_t m) { spv_size_t i; ASSERT (r >= x + len || x >= r); ASSERT (r >= y + len || y >= r); for (i = 0; i < len; i++) r[i] = sp_add (x[i], y[i], m); } /* r = [x[0] + y, x[1] + y, ... 
] */ void spv_add_sp (spv_t r, spv_t x, sp_t c, spv_size_t len, sp_t m) { spv_size_t i; for (i = 0; i < len; i++) r[i] = sp_add (x[i], c, m); } /* r = x - y */ void spv_sub (spv_t r, spv_t x, spv_t y, spv_size_t len, sp_t m) { spv_size_t i; ASSERT (r >= x + len || x >= r); ASSERT (r >= y + len || y >= r); for (i = 0; i < len; i++) r[i] = sp_sub (x[i], y[i], m); } /* r = [x[0] - y, x[1] - y, ... ] */ void spv_sub_sp (spv_t r, spv_t x, sp_t c, spv_size_t len, sp_t m) { spv_size_t i; for (i = 0; i < len; i++) r[i] = sp_sub (x[i], c, m); } /* r = [-x[0], -x[1], ... ] */ void spv_neg (spv_t r, spv_t x, spv_size_t len, sp_t m) { spv_size_t i; for (i = 0; i < len; i++) r[i] = sp_sub (0, x[i], m); } /* Pointwise multiplication * r = [x[0] * y[0], x[1] * y[1], ... ] */ void spv_pwmul (spv_t r, spv_t x, spv_t y, spv_size_t len, sp_t m, sp_t d) { spv_size_t i = 0; ASSERT (r >= x + len || x >= r); ASSERT (r >= y + len || y >= r); #if (defined(__GNUC__) || defined(__ICL)) && \ defined(__i386__) && defined(HAVE_SSE2) asm volatile ( "movd %6, %%xmm6 \n\t" "pshufd $0x44, %%xmm6, %%xmm5 \n\t" "pshufd $0, %%xmm6, %%xmm6 \n\t" "movd %7, %%xmm7 \n\t" "pshufd $0, %%xmm7, %%xmm7 \n\t" "0: \n\t" "movdqa (%1,%4,4), %%xmm0 \n\t" "movdqa (%2,%4,4), %%xmm2 \n\t" "pshufd $0x31, %%xmm0, %%xmm1\n\t" "pshufd $0x31, %%xmm2, %%xmm3\n\t" "pmuludq %%xmm2, %%xmm0 \n\t" "pmuludq %%xmm3, %%xmm1 \n\t" "movdqa %%xmm0, %%xmm2 \n\t" "movdqa %%xmm1, %%xmm3 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm2 \n\t" "pmuludq %%xmm7, %%xmm2 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm3 \n\t" "pmuludq %%xmm7, %%xmm3 \n\t" #if SP_NUMB_BITS < W_TYPE_SIZE - 1 "psrlq $33, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "psrlq $33, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" #else "pshufd $0xf5, %%xmm2, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "pshufd $0xf5, %%xmm3, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 
\n\t" "psubq %%xmm3, %%xmm1 \n\t" "psubq %%xmm5, %%xmm0 \n\t" "psubq %%xmm5, %%xmm1 \n\t" "pshufd $0xf5, %%xmm0, %%xmm2 \n\t" "pshufd $0xf5, %%xmm1, %%xmm3 \n\t" "pand %%xmm5, %%xmm2 \n\t" "pand %%xmm5, %%xmm3 \n\t" "paddq %%xmm2, %%xmm0 \n\t" "paddq %%xmm3, %%xmm1 \n\t" #endif "pshufd $0x8, %%xmm0, %%xmm0 \n\t" "pshufd $0x8, %%xmm1, %%xmm1 \n\t" "punpckldq %%xmm1, %%xmm0 \n\t" "psubd %%xmm6, %%xmm0 \n\t" "pxor %%xmm1, %%xmm1 \n\t" "pcmpgtd %%xmm0, %%xmm1 \n\t" "pand %%xmm6, %%xmm1 \n\t" "paddd %%xmm1, %%xmm0 \n\t" "movdqa %%xmm0, (%3,%4,4) \n\t" "addl $4, %4 \n\t" /* INC */ "cmpl %5, %4 \n\t" "jne 0b \n\t" :"=r"(i) :"r"(x), "r"(y), "r"(r), "0"(i), "g"(len & (spv_size_t)(~3)), "g"(m), "g"(d) :"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"); #elif defined( _MSC_VER ) && defined( SSE2) __asm { push esi push edi mov edi, x mov esi, y mov edx, r xor ecx, ecx mov eax, len and eax, ~3 movd xmm6, m pshufd xmm5, xmm6, 0x44 pshufd xmm6, xmm6, 0 movd xmm7, d pshufd xmm7, xmm7, 0 L0: movdqa xmm0, [edi+ecx*4] movdqa xmm2, [esi+ecx*4] pshufd xmm1, xmm0, 0x31 pshufd xmm3, xmm2, 0x31 pmuludq xmm0, xmm2 pmuludq xmm1, xmm3 movdqa xmm2, xmm0 movdqa xmm3, xmm1 psrlq xmm2, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm2, xmm7 psrlq xmm3, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm3, xmm7 #if SP_NUMB_BITS < W_TYPE_SIZE - 1 psrlq xmm2, 33 pmuludq xmm2, xmm6 psrlq xmm3, 33 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 #else pshufd xmm2, xmm2, 0xf5 pmuludq xmm2, xmm6 pshufd xmm3, xmm3, 0xf5 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 psubq xmm0, xmm5 psubq xmm1, xmm5 pshufd xmm2, xmm0, 0xf5 pshufd xmm3, xmm1, 0xf5 pand xmm2, xmm5 pand xmm3, xmm5 paddq xmm0, xmm2 paddq xmm1, xmm3 #endif pshufd xmm0, xmm0, 0x8 pshufd xmm1, xmm1, 0x8 punpckldq xmm0, xmm1 psubd xmm0, xmm6 pxor xmm1, xmm1 pcmpgtd xmm1, xmm0 pand xmm1, xmm6 paddd xmm0, xmm1 movdqa [edx+ecx*4], xmm0 add ecx, 4 cmp eax, ecx jne L0 mov i, ecx pop edi pop esi } #endif for (; i < len; i++) r[i] 
= sp_mul (x[i], y[i], m, d); } /* Pointwise multiplication, second input is read in reverse * r = [x[0] * y[len - 1], x[1] * y[len - 2], ... x[len - 1] * y[0]] */ void spv_pwmul_rev (spv_t r, spv_t x, spv_t y, spv_size_t len, sp_t m, sp_t d) { spv_size_t i; ASSERT (r >= x + len || x >= r); ASSERT (r >= y + len || y >= r); for (i = 0; i < len; i++) r[i] = sp_mul (x[i], y[len - 1 - i], m, d); } /* dst = src * y */ void spv_mul_sp (spv_t r, spv_t x, sp_t c, spv_size_t len, sp_t m, sp_t d) { spv_size_t i = 0; ASSERT (r >= x + len || x >= r); #if (defined(__GNUC__) || defined(__ICL)) && \ defined(__i386__) && defined(HAVE_SSE2) asm volatile ( "movd %2, %%xmm4 \n\t" "pshufd $0, %%xmm4, %%xmm4 \n\t" "movd %6, %%xmm6 \n\t" "pshufd $0x44, %%xmm6, %%xmm5 \n\t" "pshufd $0, %%xmm6, %%xmm6 \n\t" "movd %7, %%xmm7 \n\t" "pshufd $0, %%xmm7, %%xmm7 \n\t" "0: \n\t" "movdqa (%1,%4,4), %%xmm0 \n\t" "pshufd $0x31, %%xmm0, %%xmm1\n\t" "pshufd $0x31, %%xmm4, %%xmm3\n\t" "pmuludq %%xmm4, %%xmm0 \n\t" "pmuludq %%xmm3, %%xmm1 \n\t" "movdqa %%xmm0, %%xmm2 \n\t" "movdqa %%xmm1, %%xmm3 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm2 \n\t" "pmuludq %%xmm7, %%xmm2 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm3 \n\t" "pmuludq %%xmm7, %%xmm3 \n\t" #if SP_NUMB_BITS < W_TYPE_SIZE - 1 "psrlq $33, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "psrlq $33, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" #else "pshufd $0xf5, %%xmm2, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "pshufd $0xf5, %%xmm3, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" "psubq %%xmm5, %%xmm0 \n\t" "psubq %%xmm5, %%xmm1 \n\t" "pshufd $0xf5, %%xmm0, %%xmm2 \n\t" "pshufd $0xf5, %%xmm1, %%xmm3 \n\t" "pand %%xmm5, %%xmm2 \n\t" "pand %%xmm5, %%xmm3 \n\t" "paddq %%xmm2, %%xmm0 \n\t" "paddq %%xmm3, %%xmm1 \n\t" #endif "pshufd $0x8, %%xmm0, %%xmm0 \n\t" "pshufd $0x8, %%xmm1, %%xmm1 \n\t" "punpckldq %%xmm1, 
%%xmm0 \n\t" "psubd %%xmm6, %%xmm0 \n\t" "pxor %%xmm1, %%xmm1 \n\t" "pcmpgtd %%xmm0, %%xmm1 \n\t" "pand %%xmm6, %%xmm1 \n\t" "paddd %%xmm1, %%xmm0 \n\t" "movdqa %%xmm0, (%3,%4,4) \n\t" "addl $4, %4 \n\t" /* INC */ "cmpl %5, %4 \n\t" "jne 0b \n\t" :"=r"(i) :"r"(x), "g"(c), "r"(r), "0"(i), "g"(len & (spv_size_t)(~3)), "g"(m), "g"(d) :"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"); #elif defined( _MSC_VER ) && defined( SSE2) __asm { push esi push edi xor ecx, ecx mov edi, x mov esi, c mov edx, r mov eax, len and eax, ~3 movd xmm4, esi pshufd xmm4, xmm4, 0 movd xmm6, m pshufd xmm5, xmm6, 0x44 pshufd xmm6, xmm6, 0 movd xmm7, d pshufd xmm7, xmm7, 0 L0: movdqa xmm0, [edi+ecx*4] pshufd xmm1, xmm0, 0x31 pshufd xmm3, xmm4, 0x31 pmuludq xmm0, xmm4 pmuludq xmm1, xmm3 movdqa xmm2, xmm0 movdqa xmm3, xmm1 psrlq xmm2, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm2, xmm7 psrlq xmm3, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm3, xmm7 #if SP_NUMB_BITS < W_TYPE_SIZE - 1 psrlq xmm2, 33 pmuludq xmm2, xmm6 psrlq xmm3, 33 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 #else pshufd xmm2, xmm2, 0xf5 pmuludq xmm2, xmm6 pshufd xmm3, xmm3, 0xf5 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 psubq xmm0, xmm5 psubq xmm1, xmm5 pshufd xmm2, xmm0, 0xf5 pshufd xmm3, xmm1, 0xf5 pand xmm2, xmm5 pand xmm3, xmm5 paddq xmm0, xmm2 paddq xmm1, xmm3 #endif pshufd xmm0, xmm0, 0x8 pshufd xmm1, xmm1, 0x8 punpckldq xmm0, xmm1 psubd xmm0, xmm6 pxor xmm1, xmm1 pcmpgtd xmm1, xmm0 pand xmm1, xmm6 paddd xmm0, xmm1 movdqa [edx+ecx*4], xmm0 add ecx, 4 cmp eax, ecx jne L0 mov i, ecx pop edi pop esi } #endif for (; i < len; i++) r[i] = sp_mul (x[i], c, m, d); } void spv_random (spv_t x, spv_size_t len, sp_t m) { spv_size_t i; mpn_random (x, len); for (i = 0; i < len; i++) while (x[i] >= m) x[i] -= m; } ecm-6.4.4/ecm.c0000644023561000001540000012217012106741273010131 00000000000000/* Elliptic Curve Method: toplevel and stage 1 routines. 
Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Paul Zimmermann, Alexander Kruppa, Cyril Bouvier, David Cleaver. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "ecm-impl.h" #include #ifdef HAVE_LIMITS_H # include #else # define ULONG_MAX __GMP_ULONG_MAX #endif /* the following factor takes into account the smaller expected smoothness for Montgomery's curves (batch mode) with respect to Suyama's curves */ #if GMP_NUMB_BITS >= 64 /* For GMP_NUMB_BITS >= 64 we use A=4d-2 with d a square (see main.c). 
In that case, Cyril Bouvier and Razvan Barbulescu have shown that the average expected torsion is that of a generic Suyama curve multiplied by the constant 2^(1/3)/(3*3^(1/128)) */ #define BATCH1_EXTRA_SMOOTHNESS 0.416384512396064 #else /* For A=4d-2 for d a random integer, the average expected torsion is that of a generic Suyama curve multiplied by the constant 1/(3*3^(1/128)) */ #define BATCH1_EXTRA_SMOOTHNESS 0.330484606500389 #endif /****************************************************************************** * * * Elliptic Curve Method * * * ******************************************************************************/ void duplicate (mpres_t, mpres_t, mpres_t, mpres_t, mpmod_t, mpres_t, mpres_t, mpres_t, mpres_t) ATTRIBUTE_HOT; void add3 (mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpmod_t, mpres_t, mpres_t, mpres_t) ATTRIBUTE_HOT; #define mpz_mulmod5(r,s1,s2,m,t) { mpz_mul(t,s1,s2); mpz_mod(r, t, m); } /* Computes curve parameter A and a starting point (x:1) from a given sigma value. If a factor of n was found during the process, returns ECM_FACTOR_FOUND_STEP1 (and factor in f), returns ECM_NO_FACTOR_FOUND otherwise. 
*/
static int
get_curve_from_sigma (mpz_t f, mpres_t A, mpres_t x, mpz_t sigma, mpmod_t n)
{
  mpres_t t, u, v, b, z;
  int status = ECM_NO_FACTOR_FOUND;

  MEMORY_TAG;
  mpres_init (t, n);
  MEMORY_TAG;
  mpres_init (u, n);
  MEMORY_TAG;
  mpres_init (v, n);
  MEMORY_TAG;
  mpres_init (b, n);
  MEMORY_TAG;
  mpres_init (z, n);
  MEMORY_UNTAG;

  /* Build x, z, b and the numerator of A from sigma; all values are
     residues mod n. */
  mpres_set_z (u, sigma, n);
  mpres_mul_ui (v, u, 4, n);      /* v = 4*sigma */

  mpres_sqr (t, u, n);
  mpres_sub_ui (u, t, 5, n);      /* u = sigma^2 - 5 */

  mpres_sqr (t, u, n);
  mpres_mul (x, t, u, n);         /* x = u^3 */

  mpres_sqr (t, v, n);
  mpres_mul (z, t, v, n);         /* z = v^3 */

  mpres_mul (t, x, v, n);
  mpres_mul_ui (b, t, 4, n);      /* b = 4*x*v */

  mpres_mul_ui (t, u, 3, n);      /* t = 3*u, saved before u changes */
  mpres_sub (u, v, u, n);         /* u <- v - u */
  mpres_add (v, t, v, n);         /* v <- 3*(old u) + v */

  mpres_sqr (t, u, n);
  mpres_mul (u, t, u, n);         /* u <- (v - u)^3 */
  mpres_mul (A, u, v, n);         /* A = (v - u)^3 * (3*u + v), not yet
                                     divided by b */

  /* A single inversion of b*z yields both 1/z and 1/b, which normalise
     z and b to 1. */
  mpres_mul (v, b, z, n);
  if (mpres_invert (u, v, n))     /* u = (b*z)^(-1) */
    {
      mpres_mul (v, u, b, n);     /* v = z^(-1) */
      mpres_mul (x, x, v, n);     /* x = x / z */
      mpres_mul (v, u, z, n);     /* v = b^(-1) */
      mpres_mul (t, A, v, n);
      mpres_sub_ui (A, t, 2, n);  /* A = A / b - 2 */
    }
  else
    {
      /* b*z has no inverse mod n: leave the gcd in f and report it */
      mpres_gcd (f, v, n);
      status = ECM_FACTOR_FOUND_STEP1;
    }

  mpres_clear (t, n);
  mpres_clear (u, n);
  mpres_clear (v, n);
  mpres_clear (b, n);
  mpres_clear (z, n);

  return status;
}

/* switch from Montgomery's form g*y^2 = x^3 + a*x^2 + x
   to Weierstrass' form Y^2 = X^3 + A*X + B
   by change of variables x -> g*X-a/3, y -> g*Y.
   We have A = (3-a^2)/(3g^2), X = (3x+a)/(3g), Y = y/g.
   If a factor is found during the modular inverse, returns
   ECM_FACTOR_FOUND_STEP1 and the factor in f, otherwise returns
   ECM_NO_FACTOR_FOUND. 
*/
static int
montgomery_to_weierstrass (mpz_t f, mpres_t x, mpres_t y, mpres_t A,
                           mpmod_t n)
{
  mpres_t g;
  int status = ECM_FACTOR_FOUND_STEP1;

  MEMORY_TAG;
  mpres_init (g, n);
  MEMORY_UNTAG;

  /* g = x^3 + a*x^2 + x (taking y = 1), evaluated as ((x + a)*x + 1)*x */
  mpres_add (g, x, A, n);
  mpres_mul (g, g, x, n);
  mpres_add_ui (g, g, 1, n);
  mpres_mul (g, g, x, n);

  /* y = 3g^2, about to be inverted */
  mpres_mul_ui (y, g, 3, n);
  mpres_mul (y, y, g, n);

  /* NOTE(review): y is both source and destination of the inversion,
     and on failure the gcd is taken from y afterwards; this presumes
     mpres_invert leaves its operand intact when no inverse exists --
     confirm against mpres_invert. */
  if (mpres_invert (y, y, n))     /* y = 1/(3g^2) for now */
    {
      /* X = (3x+a)/(3g) = (3x+a) * g * 1/(3g^2) */
      mpres_mul_ui (x, x, 3, n);
      mpres_add (x, x, A, n);
      mpres_mul (x, x, g, n);
      mpres_mul (x, x, y, n);

      /* A = (3-a^2)/(3g^2), obtained as -(a^2 - 3) * 1/(3g^2) */
      mpres_sqr (A, A, n);
      mpres_sub_ui (A, A, 3, n);
      mpres_neg (A, A, n);
      mpres_mul (A, A, y, n);

      /* Y = 1/g = (3g) * 1/(3g^2) */
      mpres_mul_ui (g, g, 3, n);
      mpres_mul (y, y, g, n);

      status = ECM_NO_FACTOR_FOUND;
    }
  else
    mpres_gcd (f, y, n);          /* no inverse: the gcd goes to f */

  mpres_clear (g, n);

  return status;
}

/* adds Q=(x2:z2) and R=(x1:z1) and puts the result in (x3:z3),
   using 6 muls (4 muls and 2 squares), and 6 add/sub.
   One assumes that Q-R=P or R-Q=P where P=(x:z).
     - n : number to factor
     - u, v, w : auxiliary variables
   Modifies: x3, z3, u, v, w. 
(x3,z3) may be identical to (x2,z2) and to (x,z) */
/* Aliasing: x2, z2, x1 and z1 are only read while u, v, w are being
   formed, i.e. before x3 or z3 is written, so the result may overwrite
   either input point. x and z are read in the last two multiplications;
   the case x == x3 gets its own branch because writing x3 first would
   destroy x before it is used. */
void
add3 (mpres_t x3, mpres_t z3, mpres_t x2, mpres_t z2, mpres_t x1, mpres_t z1,
      mpres_t x, mpres_t z, mpmod_t n, mpres_t u, mpres_t v, mpres_t w)
{
  mpres_sub (u, x2, z2, n);
  mpres_add (v, x1, z1, n);      /* u = x2-z2, v = x1+z1 */

  mpres_mul (u, u, v, n);        /* u = (x2-z2)*(x1+z1) */

  mpres_add (w, x2, z2, n);
  mpres_sub (v, x1, z1, n);      /* w = x2+z2, v = x1-z1 */

  mpres_mul (v, w, v, n);        /* v = (x2+z2)*(x1-z1) */

  mpres_add (w, u, v, n);        /* w = 2*(x1*x2-z1*z2) */
  mpres_sub (v, u, v, n);        /* v = 2*(x2*z1-x1*z2) */

  mpres_sqr (w, w, n);           /* w = 4*(x1*x2-z1*z2)^2 */
  mpres_sqr (v, v, n);           /* v = 4*(x2*z1-x1*z2)^2 */

  if (x == x3) /* same variable: in-place variant */
    {
      /* Wanted: x3 <- w * z mod n
                 z3 <- x * v mod n
         The two products are first stored crosswise (w*z in z3, x*v in
         x3), so that x is still intact when it is read, and are then
         swapped into place. */
      mpres_mul (z3, w, z, n);
      mpres_mul (x3, x, v, n);
      mpres_swap (x3, z3, n);
    }
  else
    {
      mpres_mul (x3, w, z, n);   /* x3 = 4*z*(x1*x2-z1*z2)^2 mod n */
      mpres_mul (z3, x, v, n);   /* z3 = 4*x*(x2*z1-x1*z2)^2 mod n */
    }
  /* mul += 6; */
}

/* computes 2P=(x2:z2) from P=(x1:z1), with 5 muls (3 muls and 2 squares)
   and 4 add/sub.
     - n : number to factor
     - b : (a+2)/4 mod n
     - u, v, w : auxiliary variables
   x1 and z1 are last read when u and v are formed, before x2 or z2 is
   written, so (x2:z2) may be the same variables as (x1:z1). */
void
duplicate (mpres_t x2, mpres_t z2, mpres_t x1, mpres_t z1, mpmod_t n,
           mpres_t b, mpres_t u, mpres_t v, mpres_t w)
{
  mpres_add (u, x1, z1, n);
  mpres_sqr (u, u, n);      /* u = (x1+z1)^2 mod n */
  mpres_sub (v, x1, z1, n);
  mpres_sqr (v, v, n);      /* v = (x1-z1)^2 mod n */
  mpres_mul (x2, u, v, n);  /* x2 = u*v = (x1^2 - z1^2)^2 mod n */
  mpres_sub (w, u, v, n);   /* w = u-v = 4*x1*z1 */
  mpres_mul (u, w, b, n);   /* u = w*b = ((A+2)/4*(4*x1*z1)) mod n */
  mpres_add (u, u, v, n);   /* u = (x1-z1)^2+(A+2)/4*(4*x1*z1) */
  mpres_mul (z2, w, u, n);  /* z2 = ((4*x1*z1)*((x1-z1)^2+(A+2)/4*(4*x1*z1))) mod n */
}

/* multiply P=(x:z) by e and puts the result in (x:z). 
*/ void ecm_mul (mpres_t x, mpres_t z, mpz_t e, mpmod_t n, mpres_t b) { size_t l; int negated = 0; mpres_t x0, z0, x1, z1, u, v, w; /* In Montgomery coordinates, the point at infinity is (0::0) */ if (mpz_sgn (e) == 0) { mpz_set_ui (x, 0); mpz_set_ui (z, 0); return; } /* The negative of a point (x:y:z) is (x:-y:u). Since we do not compute y, e*(x::z) == (-e)*(x::z). */ if (mpz_sgn (e) < 0) { negated = 1; mpz_neg (e, e); } if (mpz_cmp_ui (e, 1) == 0) goto ecm_mul_end; MEMORY_TAG; mpres_init (x0, n); MEMORY_TAG; mpres_init (z0, n); MEMORY_TAG; mpres_init (x1, n); MEMORY_TAG; mpres_init (z1, n); MEMORY_TAG; mpres_init (u, n); MEMORY_TAG; mpres_init (v, n); MEMORY_TAG; mpres_init (w, n); MEMORY_UNTAG; l = mpz_sizeinbase (e, 2) - 1; /* l >= 1 */ mpres_set (x0, x, n); mpres_set (z0, z, n); duplicate (x1, z1, x0, z0, n, b, u, v, w); /* invariant: (P1,P0) = ((k+1)P, kP) where k = floor(e/2^l) */ while (l-- > 0) { if (mpz_tstbit (e, l)) /* k, k+1 -> 2k+1, 2k+2 */ { add3 (x0, z0, x0, z0, x1, z1, x, z, n, u, v, w); /* 2k+1 */ duplicate (x1, z1, x1, z1, n, b, u, v, w); /* 2k+2 */ } else /* k, k+1 -> 2k, 2k+1 */ { add3 (x1, z1, x1, z1, x0, z0, x, z, n, u, v, w); /* 2k+1 */ duplicate (x0, z0, x0, z0, n, b, u, v, w); /* 2k */ } } mpres_set (x, x0, n); mpres_set (z, z0, n); mpres_clear (x0, n); mpres_clear (z0, n); mpres_clear (x1, n); mpres_clear (z1, n); mpres_clear (u, n); mpres_clear (v, n); mpres_clear (w, n); ecm_mul_end: /* Undo negation to avoid changing the caller's e value */ if (negated) mpz_neg (e, e); } #define ADD 6.0 /* number of multiplications in an addition */ #define DUP 5.0 /* number of multiplications in a duplicate */ /* returns the number of modular multiplications for computing V_n from V_r * V_{n-r} - V_{n-2r}. 
ADD is the cost of an addition
   DUP is the cost of a duplicate */
static double
lucas_cost (ecm_uint n, double v)
{
  ecm_uint d, e, r;
  double c; /* running cost, in modular multiplications */

  /* first split of n, rounded to nearest */
  r = (ecm_uint) ((double) n * v + 0.5);
  if (r >= n)
    return (ADD * (double) n);

  d = n - r;
  e = 2 * r - n;
  c = DUP + ADD; /* initial duplicate and final addition */

  while (d != e)
    {
      ecm_uint sum, diff;
      int close;   /* nonzero when d - e <= e / 4 */
      double step; /* cost of the rule applied in this round */

      /* keep d as the larger of the two */
      if (d < e)
        {
          const ecm_uint tmp = d;
          d = e;
          e = tmp;
        }

      sum = d + e;
      diff = d - e;
      close = (diff <= e / 4);

      /* the first rule whose condition holds is applied */
      if (close && sum % 3 == 0)
        { /* condition 1: 3 additions */
          d = (2 * d - e) / 3;
          e = (e - d) / 2;
          step = 3.0 * ADD;
        }
      else if (close && diff % 6 == 0)
        { /* condition 2: one addition, one duplicate */
          d = diff / 2;
          step = ADD + DUP;
        }
      else if ((d + 3) / 4 <= e)
        { /* condition 3: one addition */
          d = diff;
          step = ADD;
        }
      else if (sum % 2 == 0)
        { /* condition 4: one addition, one duplicate */
          d = diff / 2;
          step = ADD + DUP;
        }
      else if (d % 2 == 0)
        { /* condition 5 (d+e is odd here): one addition, one duplicate */
          d /= 2;
          step = ADD + DUP;
        }
      else if (d % 3 == 0)
        { /* condition 6 (d odd, e even): three additions, one duplicate */
          d = d / 3 - e;
          step = 3.0 * ADD + DUP;
        }
      else if (sum % 3 == 0)
        { /* condition 7: three additions, one duplicate */
          d = (d - 2 * e) / 3;
          step = 3.0 * ADD + DUP;
        }
      else if (diff % 3 == 0)
        { /* condition 8: three additions, one duplicate */
          d = diff / 3;
          step = 3.0 * ADD + DUP;
        }
      else
        { /* condition 9 (e is even, catches everything else):
             one addition, one duplicate */
          e /= 2;
          step = ADD + DUP;
        }

      c += step;
    }

  return c;
}

/* computes kP from P=(xA:zA) and puts the result in (xA:zA). Assumes k>2.
   WARNING! The calls to add3() assume that the two input points are
   distinct, which is not necessarily satisfied. The result can be that
   in rare cases the point at infinity (z==0) results when it shouldn't.
   A test case is
   echo 33554520197234177 | ./ecm -sigma 2046841451 373 1
   which finds the prime even though it shouldn't (23^2=529 divides order). 
This is not a problem for ECM since at worst we'll find a factor we shouldn't have found. For other purposes (i.e. primality proving) this would have to be fixed first. */ static void prac (mpres_t xA, mpres_t zA, ecm_uint k, mpmod_t n, mpres_t b, mpres_t u, mpres_t v, mpres_t w, mpres_t xB, mpres_t zB, mpres_t xC, mpres_t zC, mpres_t xT, mpres_t zT, mpres_t xT2, mpres_t zT2) { ecm_uint d, e, r, i = 0, nv; double c, cmin; __mpz_struct *tmp; #define NV 10 /* 1/val[0] = the golden ratio (1+sqrt(5))/2, and 1/val[i] for i>0 is the real number whose continued fraction expansion is all 1s except for a 2 in i+1-st place */ static double val[NV] = { 0.61803398874989485, 0.72360679774997897, 0.58017872829546410, 0.63283980608870629, 0.61242994950949500, 0.62018198080741576, 0.61721461653440386, 0.61834711965622806, 0.61791440652881789, 0.61807966846989581}; /* for small n, it makes no sense to try 10 different Lucas chains */ nv = mpz_size ((mpz_ptr) n); if (nv > NV) nv = NV; if (nv > 1) { /* chooses the best value of v */ for (d = 0, cmin = ADD * (double) k; d < nv; d++) { c = lucas_cost (k, val[d]); if (c < cmin) { cmin = c; i = d; } } } d = k; r = (ecm_uint) ((double) d * val[i] + 0.5); /* first iteration always begins by Condition 3, then a swap */ d = k - r; e = 2 * r - k; mpres_set (xB, xA, n); mpres_set (zB, zA, n); /* B=A */ mpres_set (xC, xA, n); mpres_set (zC, zA, n); /* C=A */ duplicate (xA, zA, xA, zA, n, b, u, v, w); /* A = 2*A */ while (d != e) { if (d < e) { r = d; d = e; e = r; mpres_swap (xA, xB, n); mpres_swap (zA, zB, n); } /* do the first line of Table 4 whose condition qualifies */ if (d - e <= e / 4 && ((d + e) % 3) == 0) { /* condition 1 */ d = (2 * d - e) / 3; e = (e - d) / 2; add3 (xT, zT, xA, zA, xB, zB, xC, zC, n, u, v, w); /* T = f(A,B,C) */ add3 (xT2, zT2, xT, zT, xA, zA, xB, zB, n, u, v, w); /* T2 = f(T,A,B) */ add3 (xB, zB, xB, zB, xT, zT, xA, zA, n, u, v, w); /* B = f(B,T,A) */ mpres_swap (xA, xT2, n); mpres_swap (zA, zT2, n); /* swap A and 
T2 */ } else if (d - e <= e / 4 && (d - e) % 6 == 0) { /* condition 2 */ d = (d - e) / 2; add3 (xB, zB, xA, zA, xB, zB, xC, zC, n, u, v, w); /* B = f(A,B,C) */ duplicate (xA, zA, xA, zA, n, b, u, v, w); /* A = 2*A */ } else if ((d + 3) / 4 <= e) { /* condition 3 */ d -= e; add3 (xT, zT, xB, zB, xA, zA, xC, zC, n, u, v, w); /* T = f(B,A,C) */ /* circular permutation (B,T,C) */ tmp = xB; xB = xT; xT = xC; xC = tmp; tmp = zB; zB = zT; zT = zC; zC = tmp; } else if ((d + e) % 2 == 0) { /* condition 4 */ d = (d - e) / 2; add3 (xB, zB, xB, zB, xA, zA, xC, zC, n, u, v, w); /* B = f(B,A,C) */ duplicate (xA, zA, xA, zA, n, b, u, v, w); /* A = 2*A */ } /* now d+e is odd */ else if (d % 2 == 0) { /* condition 5 */ d /= 2; add3 (xC, zC, xC, zC, xA, zA, xB, zB, n, u, v, w); /* C = f(C,A,B) */ duplicate (xA, zA, xA, zA, n, b, u, v, w); /* A = 2*A */ } /* now d is odd, e is even */ else if (d % 3 == 0) { /* condition 6 */ d = d / 3 - e; duplicate (xT, zT, xA, zA, n, b, u, v, w); /* T = 2*A */ add3 (xT2, zT2, xA, zA, xB, zB, xC, zC, n, u, v, w); /* T2 = f(A,B,C) */ add3 (xA, zA, xT, zT, xA, zA, xA, zA, n, u, v, w); /* A = f(T,A,A) */ add3 (xT, zT, xT, zT, xT2, zT2, xC, zC, n, u, v, w); /* T = f(T,T2,C) */ /* circular permutation (C,B,T) */ tmp = xC; xC = xB; xB = xT; xT = tmp; tmp = zC; zC = zB; zB = zT; zT = tmp; } else if ((d + e) % 3 == 0) { /* condition 7 */ d = (d - 2 * e) / 3; add3 (xT, zT, xA, zA, xB, zB, xC, zC, n, u, v, w); /* T = f(A,B,C) */ add3 (xB, zB, xT, zT, xA, zA, xB, zB, n, u, v, w); /* B = f(T,A,B) */ duplicate (xT, zT, xA, zA, n, b, u, v, w); add3 (xA, zA, xA, zA, xT, zT, xA, zA, n, u, v, w); /* A = 3*A */ } else if ((d - e) % 3 == 0) { /* condition 8 */ d = (d - e) / 3; add3 (xT, zT, xA, zA, xB, zB, xC, zC, n, u, v, w); /* T = f(A,B,C) */ add3 (xC, zC, xC, zC, xA, zA, xB, zB, n, u, v, w); /* C = f(A,C,B) */ mpres_swap (xB, xT, n); mpres_swap (zB, zT, n); /* swap B and T */ duplicate (xT, zT, xA, zA, n, b, u, v, w); add3 (xA, zA, xA, zA, xT, zT, xA, zA, n, u, v, 
w); /* A = 3*A */ } else /* necessarily e is even here */ { /* condition 9 */ e /= 2; add3 (xC, zC, xC, zC, xB, zB, xA, zA, n, u, v, w); /* C = f(C,B,A) */ duplicate (xB, zB, xB, zB, n, b, u, v, w); /* B = 2*B */ } } add3 (xA, zA, xA, zA, xB, zB, xC, zC, n, u, v, w); ASSERT(d == 1); } /* Input: x is initial point A is curve parameter in Montgomery's form: g*y^2*z = x^3 + a*x^2*z + x*z^2 n is the number to factor B1 is the stage 1 bound Output: If a factor is found, it is returned in x. Otherwise, x contains the x-coordinate of the point computed in stage 1 (with z coordinate normalized to 1). B1done is set to B1 if stage 1 completed normally, or to the largest prime processed if interrupted, but never to a smaller value than B1done was upon function entry. Return value: ECM_FACTOR_FOUND_STEP1 if a factor, otherwise ECM_NO_FACTOR_FOUND */ static int ecm_stage1 (mpz_t f, mpres_t x, mpres_t A, mpmod_t n, double B1, double *B1done, mpz_t go, int (*stop_asap)(void), char *chkfilename) { mpres_t b, z, u, v, w, xB, zB, xC, zC, xT, zT, xT2, zT2; double p, r, last_chkpnt_p; int ret = ECM_NO_FACTOR_FOUND; long last_chkpnt_time; MEMORY_TAG; mpres_init (b, n); MEMORY_TAG; mpres_init (z, n); MEMORY_TAG; mpres_init (u, n); MEMORY_TAG; mpres_init (v, n); MEMORY_TAG; mpres_init (w, n); MEMORY_TAG; mpres_init (xB, n); MEMORY_TAG; mpres_init (zB, n); MEMORY_TAG; mpres_init (xC, n); MEMORY_TAG; mpres_init (zC, n); MEMORY_TAG; mpres_init (xT, n); MEMORY_TAG; mpres_init (zT, n); MEMORY_TAG; mpres_init (xT2, n); MEMORY_TAG; mpres_init (zT2, n); MEMORY_UNTAG; last_chkpnt_time = cputime (); mpres_set_ui (z, 1, n); mpres_add_ui (b, A, 2, n); mpres_div_2exp (b, b, 2, n); /* b == (A0+2)*B/4, where B=2^(k*GMP_NUMB_BITS) for MODMULN or REDC, B=1 otherwise */ /* preload group order */ if (go != NULL) ecm_mul (x, z, go, n, b); /* prac() wants multiplicands > 2 */ for (r = 2.0; r <= B1; r *= 2.0) if (r > *B1done) duplicate (x, z, x, z, n, b, u, v, w); /* We'll do 3 manually, too (that's what ecm4 
did..) */ for (r = 3.0; r <= B1; r *= 3.0) if (r > *B1done) { duplicate (xB, zB, x, z, n, b, u, v, w); add3 (x, z, x, z, xB, zB, x, z, n, u, v, w); } last_chkpnt_p = 3.; p = getprime (); /* Puts 3.0 into p. Next call gives 5.0 */ for (p = getprime (); p <= B1; p = getprime ()) { for (r = p; r <= B1; r *= p) if (r > *B1done) prac (x, z, (ecm_uint) p, n, b, u, v, w, xB, zB, xC, zC, xT, zT, xT2, zT2); if (mpres_is_zero (z, n)) { outputf (OUTPUT_VERBOSE, "Reached point at infinity, %.0f divides " "group order\n", p); break; } if (stop_asap != NULL && (*stop_asap) ()) { outputf (OUTPUT_NORMAL, "Interrupted at prime %.0f\n", p); break; } if (chkfilename != NULL && p > last_chkpnt_p + 10000. && elltime (last_chkpnt_time, cputime ()) > CHKPNT_PERIOD) { writechkfile (chkfilename, ECM_ECM, MAX(p, *B1done), n, A, x, z); last_chkpnt_p = p; last_chkpnt_time = cputime (); } } /* If stage 1 finished normally, p is the smallest prime >B1 here. In that case, set to B1 */ if (p > B1) p = B1; if (p > *B1done) *B1done = p; if (chkfilename != NULL) writechkfile (chkfilename, ECM_ECM, *B1done, n, A, x, z); getprime_clear (); /* free the prime tables, and reinitialize */ if (!mpres_invert (u, z, n)) /* Factor found? 
*/ { mpres_gcd (f, z, n); ret = ECM_FACTOR_FOUND_STEP1; } mpres_mul (x, x, u, n); mpres_clear (zT2, n); mpres_clear (xT2, n); mpres_clear (zT, n); mpres_clear (xT, n); mpres_clear (zC, n); mpres_clear (xC, n); mpres_clear (zB, n); mpres_clear (xB, n); mpres_clear (w, n); mpres_clear (v, n); mpres_clear (u, n); mpres_clear (z, n); mpres_clear (b, n); return ret; } /* choose "optimal" S according to step 2 range B2 */ int choose_S (mpz_t B2len) { if (mpz_cmp_d (B2len, 1e7) < 0) return 1; /* x^1 */ else if (mpz_cmp_d (B2len, 1e8) < 0) return 2; /* x^2 */ else if (mpz_cmp_d (B2len, 1e9) < 0) return -3; /* Dickson(3) */ else if (mpz_cmp_d (B2len, 1e10) < 0) return -6; /* Dickson(6) */ else if (mpz_cmp_d (B2len, 3e11) < 0) return -12; /* Dickson(12) */ else return -30; /* Dickson(30) */ } #define DIGITS_START 35 #define DIGITS_INCR 5 #define DIGITS_END 80 static void print_expcurves (double B1, const mpz_t B2, unsigned long dF, unsigned long k, int S, int batch) { double prob; int i, j; char sep, outs[128]; for (i = DIGITS_START, j = 0; i <= DIGITS_END; i += DIGITS_INCR, j += 3) sprintf (outs + j, "%2u%c", i, (i < DIGITS_END) ? '\t' : '\n'); outs[j] = '\0'; outputf (OUTPUT_VERBOSE, "Expected number of curves to find a factor " "of n digits:\n%s", outs); for (i = DIGITS_START; i <= DIGITS_END; i += DIGITS_INCR) { sep = (i < DIGITS_END) ? '\t' : '\n'; prob = ecmprob (B1, mpz_get_d (B2), /* in batch mode, the extra smoothness is smaller */ pow (10., i - .5) / ((batch == 1) ? BATCH1_EXTRA_SMOOTHNESS : 1.0), (double) dF * dF * k, S); if (prob > 1. / 10000000) outputf (OUTPUT_VERBOSE, "%.0f%c", floor (1. / prob + .5), sep); else if (prob > 0.) outputf (OUTPUT_VERBOSE, "%.2g%c", floor (1. 
/ prob + .5), sep); else outputf (OUTPUT_VERBOSE, "Inf%c", sep); } } static void print_exptime (double B1, const mpz_t B2, unsigned long dF, unsigned long k, int S, double tottime, int batch) { double prob, exptime; int i, j; char sep, outs[128]; for (i = DIGITS_START, j = 0; i <= DIGITS_END; i += DIGITS_INCR, j += 3) sprintf (outs + j, "%2u%c", i, (i < DIGITS_END) ? '\t' : '\n'); outs[j] = '\0'; outputf (OUTPUT_VERBOSE, "Expected time to find a factor of n digits:\n%s", outs); for (i = DIGITS_START; i <= DIGITS_END; i += DIGITS_INCR) { sep = (i < DIGITS_END) ? '\t' : '\n'; prob = ecmprob (B1, mpz_get_d (B2), /* in batch mode, the extra smoothness is smaller */ pow (10., i - .5) / ((batch == 1) ? BATCH1_EXTRA_SMOOTHNESS : 1.0), (double) dF * dF * k, S); exptime = (prob > 0.) ? tottime / prob : HUGE_VAL; outputf (OUTPUT_TRACE, "Digits: %d, Total time: %.0f, probability: " "%g, expected time: %.0f\n", i, tottime, prob, exptime); if (exptime < 1000.) outputf (OUTPUT_VERBOSE, "%.0fms%c", exptime, sep); else if (exptime < 60000.) /* One minute */ outputf (OUTPUT_VERBOSE, "%.2fs%c", exptime / 1000., sep); else if (exptime < 3600000.) /* One hour */ outputf (OUTPUT_VERBOSE, "%.2fm%c", exptime / 60000., sep); else if (exptime < 86400000.) /* One day */ outputf (OUTPUT_VERBOSE, "%.2fh%c", exptime / 3600000., sep); else if (exptime < 31536000000.) /* One year */ outputf (OUTPUT_VERBOSE, "%.2fd%c", exptime / 86400000., sep); else if (exptime < 31536000000000.) /* One thousand years */ outputf (OUTPUT_VERBOSE, "%.2fy%c", exptime / 31536000000., sep); else if (exptime < 31536000000000000.) /* One million years */ outputf (OUTPUT_VERBOSE, "%.0fy%c", exptime / 31536000000., sep); else if (prob > 0.) outputf (OUTPUT_VERBOSE, "%.1gy%c", exptime / 31536000000., sep); else outputf (OUTPUT_VERBOSE, "Inf%c", sep); } } /* go should be NULL for P+1, and P-1, it contains the y coordinate for the Weierstrass form for ECM (when sigma_is_A = -1). 
*/
/* Prints one line "Using B1=..., B2=..., polynomial ..., <curve>" at the
   given verbosity level; prints nothing if test_verbose (verbosity) is
   false.
   - B1 is printed as a range B1done-B1 unless B1done is the default.
   - B2 is printed as a range B2min-B2 unless B2min_param is negative.
   - S > 0 prints "x^S", S < 0 prints "Dickson(-S)", S == 0 prints nothing.
   - For ECM, x0 is printed as A (sigma_is_A == 1), as sigma
     (sigma_is_A == 0) or as the A of a Weierstrass curve together with
     y = go (sigma_is_A == -1).  For the other methods x0 is printed as the
     starting point, but only when B1done is the default. */
void
print_B1_B2_poly (int verbosity, int method, double B1, double B1done,
                  mpz_t B2min_param, mpz_t B2min, mpz_t B2, int S, mpz_t x0,
                  int sigma_is_A, mpz_t go)
{
  ASSERT ((method == ECM_ECM) || (go == NULL));
  ASSERT ((-1 <= sigma_is_A) && (sigma_is_A <= 1));

  if (!test_verbose (verbosity))
    return;

  outputf (verbosity, "Using ");
  if (ECM_IS_DEFAULT_B1_DONE(B1done))
    outputf (verbosity, "B1=%1.0f, ", B1);
  else
    outputf (verbosity, "B1=%1.0f-%1.0f, ", B1done, B1);
  if (mpz_sgn (B2min_param) < 0)
    outputf (verbosity, "B2=%Zd", B2);
  else
    outputf (verbosity, "B2=%Zd-%Zd", B2min, B2);

  /* %u expects an unsigned argument; the subtraction is done in unsigned
     arithmetic so that negating S cannot overflow */
  if (S > 0)
    outputf (verbosity, ", polynomial x^%u", (unsigned int) S);
  else if (S < 0)
    outputf (verbosity, ", polynomial Dickson(%u)", 0U - (unsigned int) S);

  /* don't print in resume case, since x0 is saved in resume file */
  if (method == ECM_ECM)
    {
      if (sigma_is_A == 1)
        outputf (verbosity, ", A=%Zd", x0);
      else if (sigma_is_A == 0)
        outputf (verbosity, ", sigma=%Zd", x0);
      else if (go != NULL)
        /* sigma_is_A = -1: curve was given in Weierstrass form.
           The y conversion was previously written "y=Zd" (no '%'), which
           printed the letters "Zd" and ignored go. */
        outputf (verbosity, ", Weierstrass(A=%Zd,y=%Zd)", x0, go);
      else
        /* no y coordinate supplied: do not hand a NULL mpz_t to %Zd */
        outputf (verbosity, ", Weierstrass(A=%Zd)", x0);
    }
  else if (ECM_IS_DEFAULT_B1_DONE(B1done))
    outputf (verbosity, ", x0=%Zd", x0);

  outputf (verbosity, "\n");
}

/* Input: x is starting point or zero
          sigma is sigma value (if x is set to zero) or
            A parameter (if x is non-zero) of curve
          n is the number to factor
          go is the initial group order to preload
          B1, B2 are the stage 1/stage 2 bounds, respectively
          B2min the lower bound for stage 2
          B2scale is the stage 2 scale factor
          k is the number of blocks to do in stage 2
          S is the degree of the Suyama-Brent extension for stage 2
          verbose is verbosity level: 0 no output, 1 normal output,
            2 diagnostic output.
          sigma_is_a: If true, the sigma parameter contains the curve's A value
   Output: f is the factor found.
   Return value: ECM_FACTOR_FOUND_STEPn if a factor was found,
                 ECM_NO_FACTOR_FOUND if no factor was found,
                 ECM_ERROR in case of error.
*/ int ecm (mpz_t f, mpz_t x, mpz_t sigma, mpz_t n, mpz_t go, double *B1done, double B1, mpz_t B2min_parm, mpz_t B2_parm, double B2scale, unsigned long k, const int S, int verbose, int repr, int nobase2step2, int use_ntt, int sigma_is_A, FILE *os, FILE* es, char *chkfilename, char *TreeFilename, double maxmem, double stage1time, gmp_randstate_t rng, int (*stop_asap)(void), int batch, mpz_t batch_s, ATTRIBUTE_UNUSED double gw_k, ATTRIBUTE_UNUSED unsigned long gw_b, ATTRIBUTE_UNUSED unsigned long gw_n, ATTRIBUTE_UNUSED signed long gw_c) { int youpi = ECM_NO_FACTOR_FOUND; int base2 = 0; /* If n is of form 2^n[+-]1, set base to [+-]n */ int Fermat = 0; /* If base2 > 0 is a power of 2, set Fermat to base2 */ int po2 = 0; /* Whether we should use power-of-2 poly degree */ long st; mpmod_t modulus; curve P; mpz_t B2min, B2; /* Local B2, B2min to avoid changing caller's values */ unsigned long dF; root_params_t root_params; /* 1: sigma contains A from Montgomery form By^2 = x^3 + Ax^2 + x 0: sigma contains 'sigma' from Suyama's parametrization -1: sigma contains A from Weierstrass form y^2 = x^3 + Ax + B, and go contains B */ ASSERT((-1 <= sigma_is_A) && (sigma_is_A <= 1)); set_verbose (verbose); ECM_STDOUT = (os == NULL) ? stdout : os; ECM_STDERR = (es == NULL) ? 
stdout : es; #ifdef MPRESN_NO_ADJUSTMENT /* When no adjustment is made in mpresn_ functions, N should be smaller than B^n/16 */ if (mpz_sizeinbase (n, 2) > mpz_size (n) * GMP_NUMB_BITS - 4) { outputf (OUTPUT_ERROR, "Error, N should be smaller than B^n/16\n"); return ECM_ERROR; } #endif /* In batch mode, we force MODMULN */ if (batch) repr = ECM_MOD_MODMULN; /* if n is even, return 2 */ if (mpz_divisible_2exp_p (n, 1)) { mpz_set_ui (f, 2); return ECM_FACTOR_FOUND_STEP1; } /* now n is odd */ /* check that B1 is not too large */ if (B1 > (double) ECM_UINT_MAX) { outputf (OUTPUT_ERROR, "Error, maximal step 1 bound for ECM is %lu.\n", ECM_UINT_MAX); return ECM_ERROR; } st = cputime (); if (mpmod_init (modulus, n, repr) != 0) return ECM_ERROR; /* See what kind of number we have as that may influence optimal parameter selection. Test for base 2 number. Note: this was already done by mpmod_init. */ if (modulus->repr == ECM_MOD_BASE2) base2 = modulus->bits; /* For a Fermat number (base2 a positive power of 2) */ for (Fermat = base2; Fermat > 0 && (Fermat & 1) == 0; Fermat >>= 1); if (Fermat == 1) { Fermat = base2; po2 = 1; } else Fermat = 0; MEMORY_TAG; mpres_init (P.x, modulus); MEMORY_TAG; mpres_init (P.y, modulus); MEMORY_TAG; mpres_init (P.A, modulus); mpres_set_z (P.x, x, modulus); mpres_set_ui (P.y, 1, modulus); MEMORY_TAG; mpz_init_set (B2min, B2min_parm); MEMORY_TAG; mpz_init_set (B2, B2_parm); MEMORY_TAG; mpz_init (root_params.i0); MEMORY_UNTAG; /* set second stage bound B2: when using polynomial multiplication of complexity n^alpha, stage 2 has complexity about B2^(alpha/2), and we want stage 2 to take about half of stage 1, thus we choose B2 = (c*B1)^(2/alpha). Experimentally, c=1/4 seems to work well. For Toom-Cook 3, this gives alpha=log(5)/log(3), and B2 ~ (c*B1)^1.365. For Toom-Cook 4, this gives alpha=log(7)/log(4), and B2 ~ (c*B1)^1.424. */ /* We take the cost of P+1 stage 1 to be about twice that of P-1. 
Since nai"ve P+1 and ECM cost respectively 2 and 11 multiplies per addition and duplicate, and both are optimized with PRAC, we can assume the ratio remains about 11/2. */ /* Also scale B2 by what the user said (or by the default scaling of 1.0) */ if (ECM_IS_DEFAULT_B2(B2)) mpz_set_d (B2, B2scale * pow (ECM_COST * B1, DEFAULT_B2_EXPONENT)); /* set B2min */ if (mpz_sgn (B2min) < 0) mpz_set_d (B2min, B1); /* Let bestD determine parameters for root generation and the effective B2 */ if (use_ntt) po2 = 1; root_params.d2 = 0; /* Enable automatic choice of d2 */ if (bestD (&root_params, &k, &dF, B2min, B2, po2, use_ntt, maxmem, (TreeFilename != NULL), modulus) == ECM_ERROR) { youpi = ECM_ERROR; goto end_of_ecm; } /* Set default degree for Brent-Suyama extension */ /* We try to keep the time used by the Brent-Suyama extension at about 10% of the stage 2 time */ /* Degree S Dickson polys and x^S are equally fast for ECM, so we go for the better Dickson polys whenever possible. For S == 1, 2, they behave identically. 
*/ root_params.S = S; if (root_params.S == ECM_DEFAULT_S) { if (Fermat > 0) { /* For Fermat numbers, default is 1 (no Brent-Suyama) */ root_params.S = 1; } else { mpz_t t; MEMORY_TAG; mpz_init (t); MEMORY_UNTAG; mpz_sub (t, B2, B2min); root_params.S = choose_S (t); mpz_clear (t); } } if (sigma_is_A == 0) { /* if sigma=0, generate it at random */ if (mpz_sgn (sigma) == 0) { mpz_urandomb (sigma, rng, 32); mpz_add_ui (sigma, sigma, 6); } /* sigma contains sigma value, A and x values must be computed */ youpi = get_curve_from_sigma (f, P.A, P.x, sigma, modulus); if (youpi != ECM_NO_FACTOR_FOUND) goto end_of_ecm; } else if (sigma_is_A == 1 && batch == 1) { if (mpz_sgn (sigma) == 0) { int i; /* We choose a positive integer d' smaller than B=2^GMP_NUMB_BITS and consider d = d'/B and A = 4d-2 */ do mpz_urandomb (sigma, rng, 32); /* generates d' <> 0 */ while (mpz_sgn (sigma) == 0); ASSERT((GMP_NUMB_BITS % 2) == 0); if (GMP_NUMB_BITS >= 64) mpz_mul (sigma, sigma, sigma); /* ensures d' (and thus d) is a square, which increases the success probability */ /* divide d' by B to get d */ for (i = 0; i < GMP_NUMB_BITS; i++) { if (mpz_tstbit (sigma, 0) == 1) mpz_add (sigma, sigma, n); mpz_div_2exp (sigma, sigma, 1); } mpz_mul_2exp (sigma, sigma, 2); /* 4d */ mpz_sub_ui (sigma, sigma, 2); /* 4d-2 */ } mpres_set_z (P.A, sigma, modulus); } else if (sigma_is_A == 1 && batch == 2) { if (mpz_sgn (sigma) == 0) { mpz_urandomb (sigma, rng, 32); mpz_add_ui (sigma, sigma, 2); youpi = get_curve_from_ell_parametrization (f, P.A, sigma, modulus); mpres_get_z (sigma, P.A, modulus); if (youpi != ECM_NO_FACTOR_FOUND) goto end_of_ecm; } else /* sigma contains the A value */ mpres_set_z (P.A, sigma, modulus); } else if (sigma_is_A == 1) { /* sigma contains the A value */ mpres_set_z (P.A, sigma, modulus); /* TODO: make a valid, random starting point in case none was given */ /* Problem: this may be as hard as factoring as we'd need to determine whether x^3 + a*x^2 + x is a quadratic residue or not */ 
/* For now, we'll just chicken out. */ if (mpz_sgn (x) == 0) { outputf (OUTPUT_ERROR, "Error, -A requires a starting point (-x0 x).\n"); youpi = ECM_ERROR; goto end_of_ecm; } } /* If a nonzero value is given in x, then we use it as the starting point, overwriting the one computing from sigma for sigma_is_A=0. */ if (mpz_sgn (x) != 0) mpres_set_z (P.x, x, modulus); /* Print B1, B2, polynomial and sigma */ print_B1_B2_poly (OUTPUT_NORMAL, ECM_ECM, B1, *B1done, B2min_parm, B2min, B2, root_params.S, sigma, sigma_is_A, go); #if 0 outputf (OUTPUT_VERBOSE, "b2=%1.0f, dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n", b2, dF, k, root_params.d1, root_params.d2, root_params.i0); #else outputf (OUTPUT_VERBOSE, "dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n", dF, k, root_params.d1, root_params.d2, root_params.i0); #endif if (sigma_is_A == -1) /* Weierstrass form: we perform only Stage 2, since all curves in Weierstrass form do not admit a Montgomery form. */ { mpres_set_z (P.A, sigma, modulus); /* sigma contains A */ mpres_set_z (P.y, go, modulus); /* go contains y */ if (mpz_sgn (x) == 0 || mpz_sgn (go) == 0) { outputf (OUTPUT_ERROR, "Error, sigma_is_A=-1 requires x and y.\n"); youpi = ECM_ERROR; goto end_of_ecm; } goto hecm; } if (test_verbose (OUTPUT_RESVERBOSE)) { mpz_t t; MEMORY_TAG; mpz_init (t); MEMORY_UNTAG; mpres_get_z (t, P.A, modulus); outputf (OUTPUT_RESVERBOSE, "A=%Zd\n", t); mpres_get_z (t, P.x, modulus); outputf (OUTPUT_RESVERBOSE, "starting point: x0=%Zd\n", t); mpz_clear (t); } if (go != NULL && mpz_cmp_ui (go, 1) > 0) outputf (OUTPUT_VERBOSE, "initial group order: %Zd\n", go); if (test_verbose (OUTPUT_VERBOSE)) { if (mpz_cmp_d (B2min, B1) != 0) { outputf (OUTPUT_VERBOSE, "Can't compute success probabilities for B1 <> B2min\n"); } else { rhoinit (256, 10); print_expcurves (B1, B2, dF, k, root_params.S, batch); } } #ifdef HAVE_GWNUM /* We will only use GWNUM for numbers of the form k*b^n+c */ if (gw_b != 0 && B1 >= *B1done && batch == 0) youpi = gw_ecm_stage1 (f, &P, modulus, 
B1, B1done, go, gw_k, gw_b, gw_n, gw_c); /* At this point B1 == *B1done unless interrupted, or no GWNUM ecm_stage1 is available */ if (youpi != ECM_NO_FACTOR_FOUND) goto end_of_ecm_rhotable; #endif if (B1 > *B1done) { if (batch != 0) /* FIXME: go, stop_asap and chkfilename are ignored in batch mode */ youpi = ecm_stage1_batch (f, P.x, P.A, modulus, B1, B1done, batch, batch_s); else youpi = ecm_stage1 (f, P.x, P.A, modulus, B1, B1done, go, stop_asap, chkfilename); } if (stage1time > 0.) { const long st2 = elltime (st, cputime ()); const long s1t = (long) (stage1time * 1000.); outputf (OUTPUT_NORMAL, "Step 1 took %ldms (%ld in this run, %ld from previous runs)\n", st2 + s1t, st2, s1t); } else outputf (OUTPUT_NORMAL, "Step 1 took %ldms\n", elltime (st, cputime ())); /* Store end-of-stage-1 residue in x in case we write it to a save file, before P.x is converted to Weierstrass form */ mpres_get_z (x, P.x, modulus); if (youpi != ECM_NO_FACTOR_FOUND) goto end_of_ecm_rhotable; if (test_verbose (OUTPUT_RESVERBOSE)) { mpz_t t; MEMORY_TAG; mpz_init (t); MEMORY_UNTAG; mpres_get_z (t, P.x, modulus); outputf (OUTPUT_RESVERBOSE, "x=%Zd\n", t); mpz_clear (t); } /* In case of a signal, we'll exit after the residue is printed. 
If no save file is specified, the user may still resume from the residue */ if (stop_asap != NULL && (*stop_asap) ()) goto end_of_ecm_rhotable; /* If using 2^k +/-1 modulus and 'nobase2step2' flag is set, set default (-nobase2) modular method and remap P.x, P.y, and P.A */ if (modulus->repr == ECM_MOD_BASE2 && nobase2step2) { mpz_t x_t, y_t, A_t; MEMORY_TAG; mpz_init (x_t); MEMORY_UNTAG; MEMORY_TAG; mpz_init (y_t); MEMORY_UNTAG; MEMORY_TAG; mpz_init (A_t); MEMORY_UNTAG; mpz_mod (x_t, P.x, modulus->orig_modulus); mpz_mod (y_t, P.y, modulus->orig_modulus); mpz_mod (A_t, P.A, modulus->orig_modulus); mpmod_clear (modulus); repr = ECM_MOD_NOBASE2; if (mpmod_init (modulus, n, repr) != 0) /* reset modulus for nobase2 */ return ECM_ERROR; /* remap x, y, and A for new modular method */ mpres_set_z (P.x, x_t, modulus); mpres_set_z (P.y, y_t, modulus); mpres_set_z (P.A, A_t, modulus); mpz_clear (x_t); mpz_clear (y_t); mpz_clear (A_t); } youpi = montgomery_to_weierstrass (f, P.x, P.y, P.A, modulus); hecm: if (test_verbose (OUTPUT_RESVERBOSE) && youpi == ECM_NO_FACTOR_FOUND && mpz_cmp (B2, B2min) >= 0) { mpz_t t; MEMORY_TAG; mpz_init (t); MEMORY_UNTAG; mpres_get_z (t, P.x, modulus); outputf (OUTPUT_RESVERBOSE, "After switch to Weierstrass form, " "P=(%Zd", t); mpres_get_z (t, P.y, modulus); outputf (OUTPUT_RESVERBOSE, ", %Zd)\n", t); mpres_get_z (t, P.A, modulus); outputf (OUTPUT_RESVERBOSE, "on curve Y^2 = X^3 + %Zd * X + b\n", t); mpz_clear (t); } if (youpi == ECM_NO_FACTOR_FOUND && mpz_cmp (B2, B2min) >= 0) youpi = stage2 (f, &P, modulus, dF, k, &root_params, ECM_ECM, use_ntt, TreeFilename, stop_asap); end_of_ecm_rhotable: if (test_verbose (OUTPUT_VERBOSE)) { if (mpz_cmp_d (B2min, B1) == 0) { if (youpi == ECM_NO_FACTOR_FOUND && (stop_asap == NULL || !(*stop_asap)())) print_exptime (B1, B2, dF, k, root_params.S, (long) (stage1time * 1000.) 
+ elltime (st, cputime ()), batch); rhoinit (1, 0); /* Free memory of rhotable */ } } end_of_ecm: mpres_clear (P.A, modulus); mpres_clear (P.y, modulus); mpres_clear (P.x, modulus); mpmod_clear (modulus); mpz_clear (root_params.i0); mpz_clear (B2); mpz_clear (B2min); return youpi; } ecm-6.4.4/ntt_gfp.c0000644023561000001540000004576112106741273011040 00000000000000/* ntt_gfp.c - low-level radix-2 dif/dit ntt routines over GF(p) Copyright 2005, 2006, 2007, 2008, 2009 Dave Newman, Jason Papadopoulos, Brian Gladman, Alexander Kruppa, Paul Zimmermann. The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "sp.h" #include "ecm-impl.h" /*--------------------------- FORWARD NTT --------------------------------*/ /* bfly_dif: forward (decimation-in-frequency) butterfly on two vectors of len entries. In the portable branch at the end, for each i: x0[i] <- sp_add (x0[i], x1[i], p) and x1[i] <- sp_mul (sp_sub (x0[i], x1[i], p), w[i], p, d), both computed from the old values. p and d are passed through to the sp_* helpers. NOTE(review): the two SSE2 branches step the index by 4 and use movdqa, so they presumably require len to be a positive multiple of 4 and 16-byte aligned vectors, and are presumably equivalent to the portable loop - confirm against callers. */ static void bfly_dif(spv_t x0, spv_t x1, spv_t w, spv_size_t len, sp_t p, sp_t d) { spv_size_t i = 0; #if (defined(__GNUC__) || defined(__ICL)) && \ defined(__i386__) && defined(HAVE_SSE2) asm volatile ( "movd %6, %%xmm6 \n\t" "pshufd $0x44, %%xmm6, %%xmm5 \n\t" "pshufd $0, %%xmm6, %%xmm6 \n\t" "movd %7, %%xmm7 \n\t" "pshufd $0, %%xmm7, %%xmm7 \n\t" "0: \n\t" "movdqa (%1,%4,4), %%xmm0 \n\t" "movdqa (%2,%4,4), %%xmm1 \n\t" "movdqa %%xmm1, %%xmm2 \n\t" "paddd %%xmm0, %%xmm1 \n\t" "psubd %%xmm2, %%xmm0 \n\t" "psubd %%xmm6, %%xmm1 \n\t" "pxor %%xmm2, %%xmm2 \n\t" "pcmpgtd %%xmm1, %%xmm2 \n\t" "pand %%xmm6, %%xmm2 \n\t" "paddd %%xmm2, %%xmm1 \n\t" "movdqa %%xmm1, (%1,%4,4) \n\t" "pxor %%xmm2, %%xmm2 \n\t" "pcmpgtd %%xmm0, %%xmm2 \n\t" "pand %%xmm6, %%xmm2 \n\t" "paddd %%xmm2, %%xmm0 \n\t" "movdqa (%3,%4,4), %%xmm2 \n\t" "addl $4, %4 \n\t" /* INC */ "pshufd $0x31, %%xmm0, %%xmm1\n\t" "pshufd $0x31, %%xmm2, %%xmm3\n\t" "pmuludq %%xmm2, %%xmm0 \n\t" "pmuludq %%xmm3, %%xmm1 \n\t" "movdqa %%xmm0, %%xmm2 \n\t" "movdqa %%xmm1, %%xmm3 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm2 \n\t" "pmuludq %%xmm7, %%xmm2 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm3 \n\t" "pmuludq %%xmm7, %%xmm3 \n\t" #if SP_NUMB_BITS < W_TYPE_SIZE - 1 "psrlq $33, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "psrlq $33, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" #else "pshufd $0xf5, %%xmm2, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "pshufd $0xf5, %%xmm3, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" "psubq %%xmm5, %%xmm0 \n\t" "psubq %%xmm5, %%xmm1 \n\t" "pshufd $0xf5, %%xmm0, %%xmm2 \n\t" "pshufd $0xf5, %%xmm1, %%xmm3 \n\t" "pand %%xmm5, %%xmm2 \n\t" "pand %%xmm5, %%xmm3 \n\t" "paddq %%xmm2, %%xmm0 \n\t" "paddq %%xmm3, %%xmm1 \n\t" #endif "pshufd 
$0x8, %%xmm0, %%xmm0 \n\t" "pshufd $0x8, %%xmm1, %%xmm1 \n\t" "punpckldq %%xmm1, %%xmm0 \n\t" "psubd %%xmm6, %%xmm0 \n\t" "pxor %%xmm1, %%xmm1 \n\t" "pcmpgtd %%xmm0, %%xmm1 \n\t" "pand %%xmm6, %%xmm1 \n\t" "paddd %%xmm1, %%xmm0 \n\t" "movdqa %%xmm0, -16(%2,%4,4) \n\t" "cmpl %5, %4 \n\t" "jne 0b \n\t" :"=r"(i) :"r"(x0), "r"(x1), "r"(w), "0"(i), "g"(len), "g"(p), "g"(d) :"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"); #elif defined( _MSC_VER ) && defined( SSE2) __asm { push esi push edi mov edi, x0 mov esi, x1 mov edx, w xor ecx, ecx mov eax, len movd xmm6, p pshufd xmm5, xmm6, 0x44 pshufd xmm6, xmm6, 0 movd xmm7, d pshufd xmm7, xmm7, 0 L0: movdqa xmm0, [edi+ecx*4] movdqa xmm1, [esi+ecx*4] movdqa xmm2, xmm1 paddd xmm1, xmm0 psubd xmm0, xmm2 psubd xmm1, xmm6 pxor xmm2, xmm2 pcmpgtd xmm2, xmm1 pand xmm2, xmm6 paddd xmm1, xmm2 movdqa [edi+ecx*4], xmm1 pxor xmm2, xmm2 pcmpgtd xmm2, xmm0 pand xmm2, xmm6 paddd xmm0, xmm2 movdqa xmm2, [edx+ecx*4] add ecx, 4 pshufd xmm1, xmm0, 0x31 pshufd xmm3, xmm2, 0x31 pmuludq xmm0, xmm2 pmuludq xmm1, xmm3 movdqa xmm2, xmm0 movdqa xmm3, xmm1 psrlq xmm2, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm2, xmm7 psrlq xmm3, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm3, xmm7 #if SP_NUMB_BITS < W_TYPE_SIZE - 1 psrlq xmm2, 33 pmuludq xmm2, xmm6 psrlq xmm3, 33 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 #else pshufd xmm2, xmm2, 0xf5 pmuludq xmm2, xmm6 pshufd xmm3, xmm3, 0xf5 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 psubq xmm0, xmm5 psubq xmm1, xmm5 pshufd xmm2, xmm0, 0xf5 pshufd xmm3, xmm1, 0xf5 pand xmm2, xmm5 pand xmm3, xmm5 paddq xmm0, xmm2 paddq xmm1, xmm3 #endif pshufd xmm0, xmm0, 0x8 pshufd xmm1, xmm1, 0x8 punpckldq xmm0, xmm1 psubd xmm0, xmm6 pxor xmm1, xmm1 pcmpgtd xmm1, xmm0 pand xmm1, xmm6 paddd xmm0, xmm1 movdqa [esi+ecx*4-16], xmm0 cmp eax, ecx jne L0 pop edi pop esi } #else for (i = 0; i < len; i++) { sp_t w0 = w[i]; sp_t t0 = x0[i]; sp_t t1 = x1[i]; sp_t t2, t3; t2 = sp_add (t0, t1, p); t3 = 
sp_sub (t0, t1, p); t3 = sp_mul (t3, w0, p, d); x0[i] = t2; x1[i] = t3; } #endif } /* spv_ntt_dif_core: recursive in-place forward transform of the 2^log2_len entries of x, using the twiddle table w. Lengths 1, 2, 4 and 8 are unrolled in the switch; longer inputs get one bfly_dif pass over the two halves with w[0..len-1], then each half is transformed recursively with the table advanced to w + len. */ static void spv_ntt_dif_core (spv_t x, spv_t w, spv_size_t log2_len, sp_t p, sp_t d) { spv_size_t len; spv_t x0, x1; /* handle small transforms immediately */ switch (log2_len) { case 0: return; case 1: { sp_t t0 = x[0]; sp_t t1 = x[1]; x[0] = sp_add (t0, t1, p); x[1] = sp_sub (t0, t1, p); return; } case 2: { sp_t t0 = x[0]; sp_t t1 = x[1]; sp_t t2 = x[2]; sp_t t3 = x[3]; sp_t t4, t5, t6, t7; t4 = sp_add (t0, t2, p); t6 = sp_sub (t0, t2, p); t5 = sp_add (t1, t3, p); t7 = sp_sub (t1, t3, p); x[0] = sp_add (t4, t5, p); x[1] = sp_sub (t4, t5, p); t7 = sp_mul (t7, w[1], p, d); x[2] = sp_add (t6, t7, p); x[3] = sp_sub (t6, t7, p); return; } case 3: { sp_t t0 = x[0]; sp_t t1 = x[1]; sp_t t2 = x[2]; sp_t t3 = x[3]; sp_t t4 = x[4]; sp_t t5 = x[5]; sp_t t6 = x[6]; sp_t t7 = x[7]; sp_t t8, t9, t10, t11, t12, t13, t14, t15; t8 = sp_add (t0, t4, p); t12 = sp_sub (t0, t4, p); t9 = sp_add (t1, t5, p); t13 = sp_sub (t1, t5, p); t13 = sp_mul (t13, w[1], p, d); t10 = sp_add (t2, t6, p); t14 = sp_sub (t2, t6, p); t14 = sp_mul (t14, w[2], p, d); t11 = sp_add (t3, t7, p); t15 = sp_sub (t3, t7, p); t15 = sp_mul (t15, w[3], p, d); t0 = sp_add (t8, t10, p); t2 = sp_sub (t8, t10, p); t1 = sp_add (t9, t11, p); t3 = sp_sub (t9, t11, p); t3 = sp_mul (t3, w[2], p, d); x[0] = sp_add (t0, t1, p); x[1] = sp_sub (t0, t1, p); x[2] = sp_add (t2, t3, p); x[3] = sp_sub (t2, t3, p); t0 = sp_add (t12, t14, p); t2 = sp_sub (t12, t14, p); t1 = sp_add (t13, t15, p); t3 = sp_sub (t13, t15, p); t3 = sp_mul (t3, w[2], p, d); x[4] = sp_add (t0, t1, p); x[5] = sp_sub (t0, t1, p); x[6] = sp_add (t2, t3, p); x[7] = sp_sub (t2, t3, p); return; } } len = 1 << (log2_len - 1); x0 = x; x1 = x + len; bfly_dif (x0, x1, w, len, p, d); spv_ntt_dif_core (x0, w + len, log2_len - 1, p, d); spv_ntt_dif_core (x1, w + len, log2_len - 1, p, d); } /* spv_ntt_gfp_dif: forward transform entry point for 2^log2_len entries of x, with modulus data->sp and multiplier constant data->mul_c. Up to log2_len == NTT_GFP_TWIDDLE_DIF_BREAKOVER the last 2^log2_len entries of data->nttdata->twiddle are used as the twiddle table. Above that, twiddles are built in data->scratch as successive powers of ntt_roots[log2_len], one block of at most MAX_NTT_BLOCK_SIZE entries at a time, one butterfly pass is applied, and both halves are transformed recursively. */ void spv_ntt_gfp_dif (spv_t x, spv_size_t log2_len, spm_t data) { sp_t p = data->sp; sp_t d = 
data->mul_c; if (log2_len <= NTT_GFP_TWIDDLE_DIF_BREAKOVER) { spv_t w = data->nttdata->twiddle + data->nttdata->twiddle_size - (1 << log2_len); spv_ntt_dif_core (x, w, log2_len, p, d); } else { /* recursive version for data that doesn't fit in the L1 cache */ spv_size_t len = 1 << (log2_len - 1); spv_t x0 = x; spv_t x1 = x + len; spv_t roots = data->nttdata->ntt_roots; { spv_size_t i; spv_size_t block_size = MIN(len, MAX_NTT_BLOCK_SIZE); sp_t root = roots[log2_len]; spv_t w = data->scratch; w[0] = 1; for (i = 1; i < block_size; i++) w[i] = sp_mul (w[i-1], root, p, d); root = sp_pow (root, block_size, p, d); for (i = 0; i < len; i += block_size) { if (i) spv_mul_sp (w, w, root, block_size, p, d); bfly_dif (x0 + i, x1 + i, w, block_size, p, d); } } spv_ntt_gfp_dif (x0, log2_len - 1, data); spv_ntt_gfp_dif (x1, log2_len - 1, data); } } /*--------------------------- INVERSE NTT --------------------------------*/ /* bfly_dit: decimation-in-time butterfly on two vectors of len entries. In the portable branch at the end, for each i: t1 = sp_mul (x1[i], w[i], p, d), then x0[i] <- sp_add (x0[i], t1, p) and x1[i] <- sp_sub (x0[i], t1, p), both computed from the old x0[i]. NOTE(review): as in bfly_dif, the SSE2 branches step the index by 4 and use movdqa, so they presumably require len to be a positive multiple of 4 and 16-byte aligned vectors - confirm against callers. */ static inline void bfly_dit(spv_t x0, spv_t x1, spv_t w, spv_size_t len, sp_t p, sp_t d) { spv_size_t i = 0; #if (defined(__GNUC__) || defined(__ICL)) && \ defined(__i386__) && defined(HAVE_SSE2) asm volatile ( "movd %6, %%xmm6 \n\t" "pshufd $0x44, %%xmm6, %%xmm5 \n\t" "pshufd $0, %%xmm6, %%xmm6 \n\t" "movd %7, %%xmm7 \n\t" "pshufd $0, %%xmm7, %%xmm7 \n\t" "0: \n\t" "movdqa (%2,%4,4), %%xmm0 \n\t" "movdqa (%3,%4,4), %%xmm2 \n\t" "pshufd $0x31, %%xmm0, %%xmm1\n\t" "pshufd $0x31, %%xmm2, %%xmm3\n\t" "pmuludq %%xmm2, %%xmm0 \n\t" "pmuludq %%xmm3, %%xmm1 \n\t" "movdqa %%xmm0, %%xmm2 \n\t" "movdqa %%xmm1, %%xmm3 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm2 \n\t" "pmuludq %%xmm7, %%xmm2 \n\t" "psrlq $" STRING((2*SP_NUMB_BITS - W_TYPE_SIZE)) ", %%xmm3 \n\t" "pmuludq %%xmm7, %%xmm3 \n\t" #if SP_NUMB_BITS < W_TYPE_SIZE - 1 "psrlq $33, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 \n\t" "psrlq $33, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" #else "pshufd $0xf5, %%xmm2, %%xmm2 \n\t" "pmuludq %%xmm6, %%xmm2 
\n\t" "pshufd $0xf5, %%xmm3, %%xmm3 \n\t" "pmuludq %%xmm6, %%xmm3 \n\t" "psubq %%xmm2, %%xmm0 \n\t" "psubq %%xmm3, %%xmm1 \n\t" "psubq %%xmm5, %%xmm0 \n\t" "psubq %%xmm5, %%xmm1 \n\t" "pshufd $0xf5, %%xmm0, %%xmm2 \n\t" "pshufd $0xf5, %%xmm1, %%xmm3 \n\t" "pand %%xmm5, %%xmm2 \n\t" "pand %%xmm5, %%xmm3 \n\t" "paddq %%xmm2, %%xmm0 \n\t" "paddq %%xmm3, %%xmm1 \n\t" #endif "pshufd $0x8, %%xmm0, %%xmm0 \n\t" "pshufd $0x8, %%xmm1, %%xmm1 \n\t" "punpckldq %%xmm1, %%xmm0 \n\t" "psubd %%xmm6, %%xmm0 \n\t" "pxor %%xmm1, %%xmm1 \n\t" "pcmpgtd %%xmm0, %%xmm1 \n\t" "pand %%xmm6, %%xmm1 \n\t" "paddd %%xmm0, %%xmm1 \n\t" "movdqa (%1,%4,4), %%xmm0 \n\t" "movdqa %%xmm1, %%xmm2 \n\t" "paddd %%xmm0, %%xmm1 \n\t" "psubd %%xmm2, %%xmm0 \n\t" "psubd %%xmm6, %%xmm1 \n\t" "pxor %%xmm2, %%xmm2 \n\t" "pcmpgtd %%xmm1, %%xmm2 \n\t" "pand %%xmm6, %%xmm2 \n\t" "paddd %%xmm2, %%xmm1 \n\t" "movdqa %%xmm1, (%1,%4,4) \n\t" "pxor %%xmm2, %%xmm2 \n\t" "pcmpgtd %%xmm0, %%xmm2 \n\t" "pand %%xmm6, %%xmm2 \n\t" "paddd %%xmm2, %%xmm0 \n\t" "movdqa %%xmm0, (%2,%4,4) \n\t" "addl $4, %4 \n\t" /* INC */ "cmpl %5, %4 \n\t" "jne 0b \n\t" :"=r"(i) :"r"(x0), "r"(x1), "r"(w), "0"(i), "g"(len), "g"(p), "g"(d) :"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"); #elif defined( _MSC_VER ) && defined( SSE2) __asm { push esi push edi mov edi, x0 mov esi, x1 mov edx, w xor ecx, ecx mov eax, len movd xmm6, p pshufd xmm5, xmm6, 0x44 pshufd xmm6, xmm6, 0 movd xmm7, d pshufd xmm7, xmm7, 0 L0: movdqa xmm0, [esi+ecx*4] movdqa xmm2, [edx+ecx*4] pshufd xmm1, xmm0, 0x31 pshufd xmm3, xmm2, 0x31 pmuludq xmm0, xmm2 pmuludq xmm1, xmm3 movdqa xmm2, xmm0 movdqa xmm3, xmm1 psrlq xmm2, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm2, xmm7 psrlq xmm3, 2*SP_NUMB_BITS - W_TYPE_SIZE pmuludq xmm3, xmm7 #if SP_NUMB_BITS < W_TYPE_SIZE - 1 psrlq xmm2, 33 pmuludq xmm2, xmm6 psrlq xmm3, 33 pmuludq xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 #else pshufd xmm2, xmm2, 0xf5 pmuludq xmm2, xmm6 pshufd xmm3, xmm3, 0xf5 pmuludq 
xmm3, xmm6 psubq xmm0, xmm2 psubq xmm1, xmm3 psubq xmm0, xmm5 psubq xmm1, xmm5 pshufd xmm2, xmm0, 0xf5 pshufd xmm3, xmm1, 0xf5 pand xmm2, xmm5 pand xmm3, xmm5 paddq xmm0, xmm2 paddq xmm1, xmm3 #endif pshufd xmm0, xmm0, 0x8 pshufd xmm1, xmm1, 0x8 punpckldq xmm0, xmm1 psubd xmm0, xmm6 pxor xmm1, xmm1 pcmpgtd xmm1, xmm0 pand xmm1, xmm6 paddd xmm1, xmm0 movdqa xmm0, [edi+ecx*4] movdqa xmm2, xmm1 paddd xmm1, xmm0 psubd xmm0, xmm2 psubd xmm1, xmm6 pxor xmm2, xmm2 pcmpgtd xmm2, xmm1 pand xmm2, xmm6 paddd xmm1, xmm2 movdqa [edi+ecx*4], xmm1 pxor xmm2, xmm2 pcmpgtd xmm2, xmm0 pand xmm2, xmm6 paddd xmm0, xmm2 movdqa [esi+ecx*4], xmm0 add ecx, 4 cmp eax, ecx jne L0 pop edi pop esi } #else for (i = 0; i < len; i++) { sp_t w0 = w[i]; sp_t t0 = x0[i]; sp_t t1 = x1[i]; t1 = sp_mul (t1, w0, p, d); x0[i] = sp_add (t0, t1, p); x1[i] = sp_sub (t0, t1, p); } #endif } static void spv_ntt_dit_core (spv_t x, spv_t w, spv_size_t log2_len, sp_t p, sp_t d) { spv_size_t len; spv_t x0, x1; /* handle small transforms immediately */ switch (log2_len) { case 0: return; case 1: { sp_t t0 = x[0]; sp_t t1 = x[1]; x[0] = sp_add (t0, t1, p); x[1] = sp_sub (t0, t1, p); return; } case 2: { sp_t t0 = x[0]; sp_t t1 = x[1]; sp_t t2 = x[2]; sp_t t3 = x[3]; sp_t t4, t5, t6, t7; t4 = sp_add (t0, t1, p); t5 = sp_sub (t0, t1, p); t6 = sp_add (t2, t3, p); t7 = sp_sub (t2, t3, p); x[0] = sp_add (t4, t6, p); x[2] = sp_sub (t4, t6, p); t7 = sp_mul (t7, w[1], p, d); x[1] = sp_add (t5, t7, p); x[3] = sp_sub (t5, t7, p); return; } case 3: { sp_t t0 = x[0]; sp_t t1 = x[1]; sp_t t2 = x[2]; sp_t t3 = x[3]; sp_t t4 = x[4]; sp_t t5 = x[5]; sp_t t6 = x[6]; sp_t t7 = x[7]; sp_t t8, t9, t10, t11; t8 = sp_add(t0, t1, p); t9 = sp_sub(t0, t1, p); t10 = sp_add(t2, t3, p); t11 = sp_sub(t2, t3, p); t0 = sp_add(t8, t10, p); t2 = sp_sub(t8, t10, p); t11 = sp_mul (t11, w[2], p, d); t1 = sp_add(t9, t11, p); t3 = sp_sub(t9, t11, p); t8 = sp_add(t4, t5, p); t9 = sp_sub(t4, t5, p); t10 = sp_add(t6, t7, p); t11 = sp_sub(t6, t7, p); t4 = 
sp_add(t8, t10, p); t6 = sp_sub(t8, t10, p); t11 = sp_mul (t11, w[2], p, d); t5 = sp_add(t9, t11, p); t7 = sp_sub(t9, t11, p); x[0] = sp_add(t0, t4, p); x[4] = sp_sub(t0, t4, p); t5 = sp_mul (t5, w[1], p, d); x[1] = sp_add(t1, t5, p); x[5] = sp_sub(t1, t5, p); t6 = sp_mul (t6, w[2], p, d); x[2] = sp_add(t2, t6, p); x[6] = sp_sub(t2, t6, p); t7 = sp_mul (t7, w[3], p, d); x[3] = sp_add(t3, t7, p); x[7] = sp_sub(t3, t7, p); return; } } len = 1 << (log2_len - 1); x0 = x; x1 = x + len; spv_ntt_dit_core (x0, w + len, log2_len - 1, p, d); spv_ntt_dit_core (x1, w + len, log2_len - 1, p, d); bfly_dit (x0, x1, w, len, p, d); } void spv_ntt_gfp_dit (spv_t x, spv_size_t log2_len, spm_t data) { sp_t p = data->sp; sp_t d = data->mul_c; if (log2_len <= NTT_GFP_TWIDDLE_DIT_BREAKOVER) { spv_t w = data->inttdata->twiddle + data->inttdata->twiddle_size - (1 << log2_len); spv_ntt_dit_core (x, w, log2_len, p, d); } else { spv_size_t len = 1 << (log2_len - 1); spv_t x0 = x; spv_t x1 = x + len; spv_t roots = data->inttdata->ntt_roots; spv_ntt_gfp_dit (x0, log2_len - 1, data); spv_ntt_gfp_dit (x1, log2_len - 1, data); { spv_size_t i; spv_size_t block_size = MIN(len, MAX_NTT_BLOCK_SIZE); sp_t root = roots[log2_len]; spv_t w = data->scratch; w[0] = 1; for (i = 1; i < block_size; i++) w[i] = sp_mul (w[i-1], root, p, d); root = sp_pow (root, block_size, p, d); for (i = 0; i < len; i += block_size) { if (i) spv_mul_sp (w, w, root, block_size, p, d); bfly_dit (x0 + i, x1 + i, w, block_size, p, d); } } } } ecm-6.4.4/ks-multiply.c0000644023561000001540000002736712106741273011673 00000000000000/* Polynomial multiplication using GMP's integer multiplication code Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010 Dave Newman, Paul Zimmermann, Alexander Kruppa. This file is part of the ECM Library. 
The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include "ecm-gmp.h" /* for MPZ_REALLOC and MPN_COPY */ #include "ecm-impl.h" #define FFT_WRAP /* always defined since mpn_mul_fft is included */ /* Puts in R[0..2l-2] the product of A[0..l-1] and B[0..l-1]. T must have as much space as for toomcook4 (it is only used when that function is called). Notes: - this code aligns the coeffs at limb boundaries - if instead we aligned at byte boundaries then we could save up to 3*l bytes in T0 and T1, but tests have shown this doesn't give any significant speed increase, even for large degree polynomials. - this code requires that all coefficients A[] and B[] are nonnegative. */ void kronecker_schonhage (listz_t R, listz_t A, listz_t B, unsigned int l, listz_t T) { unsigned long i; mp_size_t s, t = 0, size_t0, size_tmp; mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr; s = mpz_sizeinbase (A[0], 2); if ((double) l * (double) s < KS_MUL_THRESHOLD) { toomcook4 (R, A, B, l, T); return; } for (i = 0; i < l; i++) { if ((s = mpz_sizeinbase (A[i], 2)) > t) t = s; if ((s = mpz_sizeinbase (B[i], 2)) > t) t = s; } /* For n > 0, s = sizeinbase (n, 2) <==> 2^(s-1) <= n < 2^s. For n = 0, s = sizeinbase (n, 2) = 1 ==> n < 2^s. 
Hence all A[i], B[i] < 2^t */ /* Each coeff of A(x)*B(x) < l * 2^(2*t), so max number of bits in a coeff of T[0] * T[1] will be 2 * t + ceil(log_2(l)) */ s = t * 2; for (i = l - 1; i; s++, i >>= 1); /* ceil(log_2(l)) = 1+floor(log_2(l-1)) */ /* work out the corresponding number of limbs */ s = 1 + (s - 1) / GMP_NUMB_BITS; /* Note: s * (l - 1) + ceil(t/GMP_NUMB_BITS) should be faster, but no significant speedup was observed */ size_t0 = s * l; /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */ t0_ptr = (mp_ptr) malloc (2 * size_t0 * sizeof (mp_limb_t)); if (t0_ptr == NULL) { outputf (OUTPUT_ERROR, "Out of memory in kronecker_schonhage()\n"); exit (1); } t1_ptr = t0_ptr + size_t0; MPN_ZERO (t0_ptr, 2 * size_t0); for (i = 0; i < l; i++) { ASSERT(SIZ(A[i]) >= 0); if (SIZ(A[i])) MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i])); ASSERT(SIZ(B[i]) >= 0); if (SIZ(B[i])) MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i])); } t2_ptr = (mp_ptr) malloc (2 * size_t0 * sizeof (mp_limb_t)); if (t2_ptr == NULL) { free (t0_ptr); outputf (OUTPUT_ERROR, "Out of memory in kronecker_schonhage()\n"); exit (1); } /* mpn_mul_fft_full () allocates auxiliary memory of about 8n limbs, thus the total memory allocated by this function is about 12*size_t0. Since size_t0 is about 2*dF*limbs(modulus), this is about 24*dF*limbs(modulus). */ mpn_mul_fft_full (t2_ptr, t0_ptr, size_t0, t1_ptr, size_t0); for (i = 0; i < 2 * l - 1; i++) { size_tmp = s; MPN_NORMALIZE(t2_ptr + i * s, size_tmp); r_ptr = MPZ_REALLOC (R[i], size_tmp); if (size_tmp) MPN_COPY (r_ptr, t2_ptr + i * s, size_tmp); SIZ(R[i]) = size_tmp; } free (t0_ptr); free (t2_ptr); } /* Given a[0..m] and c[0..l], puts in b[0..n] the coefficients of degree m to n+m of rev(a)*c, i.e. b[0] = a[0]*c[0] + ... + a[i]*c[i] with i = min(m, l) ... b[k] = a[0]*c[k] + ... + a[i]*c[i+k] with i = min(m, l-k) ... b[n] = a[0]*c[n] + ... + a[i]*c[i+n] with i = min(m, l-n) [=l-n]. If rev=0, consider a instead of rev(a). Assumes n <= l. 
Return non-zero if an error occurred. */ #undef TEST_OLD_S int TMulKS (listz_t b, unsigned int n, listz_t a, unsigned int m, listz_t c, unsigned int l, mpz_t modulus, int rev) { unsigned long i, s = 0, t, k; mp_ptr ap, bp, cp; mp_size_t an, bn, cn; int ret = 0; /* default return value */ #ifdef TEST_OLD_S unsigned long s_old = 0, k_old; mp_size_t bn_old; #endif #ifdef DEBUG long st = cputime (); fprintf (ECM_STDOUT, "n=%u m=%u l=%u bits=%u n*bits=%u: ", n, m, l, mpz_sizeinbase (modulus, 2), n * mpz_sizeinbase (modulus, 2)); #endif ASSERT (n <= l); /* otherwise the upper coefficients of b are 0 */ if (l > n + m) l = n + m; /* otherwise, c has too many coeffs */ /* compute max bits of a[] and c[] */ for (i = 0; i <= m; i++) { if (mpz_sgn (a[i]) < 0) mpz_mod (a[i], a[i], modulus); if ((t = mpz_sizeinbase (a[i], 2)) > s) s = t; } for (i = 0; i <= l; i++) { if (mpz_sgn (c[i]) < 0) mpz_mod (c[i], c[i], modulus); if ((t = mpz_sizeinbase (c[i], 2)) > s) s = t; } #ifdef FFT_WRAP s ++; /* need one extra bit to determine sign of low(b) - high(b) */ #endif #ifdef TEST_OLD_S /* We used max(m,l) before. We compute the corresponding s for comparison. */ for (s_old = 2 * s, i = (m > l) ? m : l; i; s_old++, i >>= 1); #endif /* max coeff has 2*s+ceil(log2(min(m+1,l+1))) bits, i.e. 2*s + 1 + floor(log2(min(m,l))) */ for (s = 2 * s, i = (m < l) ? m : l; i; s++, i >>= 1); /* corresponding number of limbs */ s = 1 + (s - 1) / GMP_NUMB_BITS; #ifdef TEST_OLD_S s_old = 1 + (s_old - 1) / GMP_NUMB_BITS; #endif an = (m + 1) * s; cn = (l + 1) * s; bn = an + cn; /* a[0..m] needs (m+1) * s limbs */ ap = (mp_ptr) malloc (an * sizeof (mp_limb_t)); if (ap == NULL) { ret = 1; goto TMulKS_end; } cp = (mp_ptr) malloc (cn * sizeof (mp_limb_t)); if (cp == NULL) { ret = 1; goto TMulKS_free_ap; } MPN_ZERO (ap, an); MPN_ZERO (cp, cn); /* a is reverted */ for (i = 0; i <= m; i++) if (SIZ(a[i])) MPN_COPY (ap + ((rev) ? 
(m - i) : i) * s, PTR(a[i]), SIZ(a[i])); for (i = 0; i <= l; i++) if (SIZ(c[i])) MPN_COPY (cp + i * s, PTR(c[i]), SIZ(c[i])); #ifdef FFT_WRAP /* the product rev(a) * c has m+l+1 coefficients. We throw away the first m and the last l-n <= m. If we compute mod (m+n+1) * s limbs, we are ok */ k = mpn_fft_best_k ((m + n + 1) * s, 0); bn = mpn_fft_next_size ((m + n + 1) * s, k); #ifdef TEST_OLD_S k_old = mpn_fft_best_k ((m + n + 1) * s_old, 0); if (k != k_old) outputf (OUTPUT_ERROR, "Got different FFT transform length, k = %lu, k_old : %lu\n", k, k_old); bn_old = mpn_fft_next_size ((m + n + 1) * s_old, k_old); if (bn != bn_old) outputf (OUTPUT_ERROR, "Got different FFT size, bn = %d, bn_old : %d\n", (int) bn, (int) bn_old); #endif bp = (mp_ptr) malloc ((bn + 1) * sizeof (mp_limb_t)); if (bp == NULL) { ret = 1; goto TMulKS_free_cp; } mpn_mul_fft (bp, bn, ap, an, cp, cn, k); if (m && bp[m * s - 1] >> (GMP_NUMB_BITS - 1)) /* lo(b)-hi(b) is negative */ mpn_add_1 (bp + m * s, bp + m * s, (n + 1) * s, (mp_limb_t) 1); #else bp = (mp_ptr) malloc (bn * sizeof (mp_limb_t)); if (bp == NULL) { ret = 1; goto TMulKS_free_cp; } if (an >= cn) mpn_mul (bp, ap, an, cp, cn); else mpn_mul (bp, cp, cn, ap, an); #endif /* recover coefficients of degree m to n+m of product in b[0..n] */ bp += m * s; for (i = 0; i <= n; i++) { t = s; MPN_NORMALIZE(bp, t); MPZ_REALLOC (b[i], (mp_size_t) t); if (t) MPN_COPY (PTR(b[i]), bp, t); SIZ(b[i]) = t; bp += s; } bp -= (m + n + 1) * s; free (bp); TMulKS_free_cp: free (cp); TMulKS_free_ap: free (ap); #ifdef DEBUG fprintf (ECM_STDOUT, "%ums\n", elltime (st, cputime ())); #endif TMulKS_end: return ret; } #ifdef DEBUG void mpn_print (mp_ptr np, mp_size_t nn) { mp_size_t i; for (i = 0; i < nn; i++) fprintf (ECM_STDOUT, "+%lu*B^%u", np[i], i); fprintf (ECM_STDOUT, "\n"); } #endif unsigned int ks_wrapmul_m (unsigned int m0, unsigned int k, mpz_t n) { mp_size_t t, s; unsigned long i, fft_k, m; t = mpz_sizeinbase (n, 2); s = t * 2 + 1; for (i = k - 1; i; s++, i >>= 
1); s = 1 + (s - 1) / GMP_NUMB_BITS; fft_k = mpn_fft_best_k (m0 * s, 0); i = mpn_fft_next_size (m0 * s, fft_k); while (i % s) i = mpn_fft_next_size (i + 1, fft_k); m = i / s; return m; } /* multiply in R[] A[0]+A[1]*x+...+A[k-1]*x^(k-1) by B[0]+B[1]*x+...+B[l-1]*x^(l-1) modulo n, wrapping around coefficients of the product up from degree m >= m0. Assumes k >= l. R is assumed to have 2*m0-3+list_mul_mem(m0-1) allocated cells. Return m (or 0 if an error occurred). */ unsigned int ks_wrapmul (listz_t R, unsigned int m0, listz_t A, unsigned int k, listz_t B, unsigned int l, mpz_t n) { unsigned long i, fft_k, m, t; mp_size_t s, size_t0, size_t1, size_tmp; mp_ptr t0_ptr, t1_ptr, t2_ptr, r_ptr, tp; int negative; ASSERT(k >= l); t = mpz_sizeinbase (n, 2); for (i = 0; i < k; i++) if (mpz_sgn (A[i]) < 0 || mpz_sizeinbase (A[i], 2) > t) mpz_mod (A[i], A[i], n); for (i = 0; i < l; i++) if (mpz_sgn (B[i]) < 0 || mpz_sizeinbase (B[i], 2) > t) mpz_mod (B[i], B[i], n); s = t * 2 + 1; /* one extra sign bit */ for (i = k - 1; i; s++, i >>= 1); s = 1 + (s - 1) / GMP_NUMB_BITS; size_t0 = s * k; size_t1 = s * l; /* allocate one double-buffer to save malloc/MPN_ZERO/free calls */ t0_ptr = (mp_ptr) malloc (size_t0 * sizeof (mp_limb_t)); if (t0_ptr == NULL) return 0; t1_ptr = (mp_ptr) malloc (size_t1 * sizeof (mp_limb_t)); if (t1_ptr == NULL) { free (t0_ptr); return 0; } MPN_ZERO (t0_ptr, size_t0); MPN_ZERO (t1_ptr, size_t1); for (i = 0; i < k; i++) if (SIZ(A[i])) MPN_COPY (t0_ptr + i * s, PTR(A[i]), SIZ(A[i])); for (i = 0; i < l; i++) if (SIZ(B[i])) MPN_COPY (t1_ptr + i * s, PTR(B[i]), SIZ(B[i])); fft_k = mpn_fft_best_k (m0 * s, 0); i = mpn_fft_next_size (m0 * s, fft_k); /* the following loop ensures we don't cut in the middle of a coefficient */ while (i % s) i = mpn_fft_next_size (i + 1, fft_k); m = i / s; ASSERT(m <= 2 * m0 - 3 + list_mul_mem (m0 - 1)); t2_ptr = (mp_ptr) malloc ((i + 1) * sizeof (mp_limb_t)); if (t2_ptr == NULL) { free (t0_ptr); free (t1_ptr); return 0; } mpn_mul_fft 
(t2_ptr, i, t0_ptr, size_t0, t1_ptr, size_t1, fft_k); for (i = 0, tp = t2_ptr, negative = 0; i < m; i++) { size_tmp = s; if (negative) /* previous was negative, add 1 */ mpn_add_1 (tp, tp, s, (mp_limb_t) 1); /* no need to check return value of mpn_add_1: if 1, then {tp, s} is now identically 0, and should remain so */ MPN_NORMALIZE(tp, size_tmp); if ((size_tmp == s) && (tp[s - 1] >> (GMP_NUMB_BITS - 1))) { negative = 1; mpn_com_n (tp, tp, s); mpn_add_1 (tp, tp, s, (mp_limb_t) 1); } else negative = 0; r_ptr = MPZ_REALLOC (R[i], size_tmp); if (size_tmp) MPN_COPY (r_ptr, tp, size_tmp); SIZ(R[i]) = (negative) ? -size_tmp : size_tmp; tp += s; } free (t0_ptr); free (t1_ptr); free (t2_ptr); return m; } ecm-6.4.4/ecm_ntt.c0000644023561000001540000002745412106741274011030 00000000000000/* ecm_ntt.c - high level poly functions to interface between ecm and sp Copyright 2005, 2006, 2007, 2008, 2009, 2011, 2012 Dave Newman, Paul Zimmermann, Alexander Kruppa. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include #include #include "sp.h" #include "ecm-impl.h" #ifdef HAVE_UNISTD_H #include /* for unlink */ #endif #define UNUSED 0 /* memory: 4 * len mpspv coeffs */ void ntt_mul (mpzv_t r, mpzv_t x, mpzv_t y, spv_size_t len, mpzv_t t, int monic, mpzspm_t mpzspm) { mpzspv_t u, v; if (len < MUL_NTT_THRESHOLD) { list_mul (r, x, len, monic, y, len, monic, t); return; } u = mpzspv_init (2 * len, mpzspm); v = mpzspv_init (2 * len, mpzspm); mpzspv_from_mpzv (v, 0, y, len, mpzspm); mpzspv_from_mpzv (u, 0, x, len, mpzspm); mpzspv_mul_ntt(u, 0, u, 0, len, v, 0, len, 2 * len, monic, monic ? 2 * len : 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_FFT2 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); mpzspv_to_mpzv (u, 0, r, 2 * len - 1 + monic, mpzspm); mpzspv_clear (u, mpzspm); mpzspv_clear (v, mpzspm); } /* memory: 2 * len mpzspv coeffs */ void ntt_PolyFromRoots (mpzv_t r, mpzv_t a, spv_size_t len, mpzv_t t, mpzspm_t mpzspm) { mpzspv_t x; spv_size_t i, m; ASSERT (len == ((spv_size_t)1) << ceil_log2 (len)); if (len <= MUL_NTT_THRESHOLD) { PolyFromRoots (r, a, len, t, mpzspm->modulus); return; } x = mpzspv_init (2 * len, mpzspm); for (i = 0; i < len; i += MUL_NTT_THRESHOLD) { PolyFromRoots (r, a + i, MUL_NTT_THRESHOLD, t, mpzspm->modulus); mpzspv_from_mpzv (x, 2 * i, r, MUL_NTT_THRESHOLD, mpzspm); } for (m = MUL_NTT_THRESHOLD; m < len; m *= 2) { for (i = 0; i < 2 * len; i += 4 * m) { mpzspv_mul_ntt (x, i, x, i, m, x, i + 2 * m, m, 2 * m, 1, 2 * m, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_FFT2 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); if (2 * m < len) mpzspv_normalise (x, i, 2 * m, mpzspm); } } mpzspv_to_mpzv (x, 0, r, len, mpzspm); mpzspv_clear (x, mpzspm); } /* memory: 2 * len mpzspv coeffs */ int ntt_PolyFromRoots_Tree (mpzv_t r, mpzv_t a, spv_size_t len, mpzv_t t, int dolvl, mpzspm_t mpzspm, mpzv_t *Tree, FILE *TreeFile) { mpzspv_t x; spv_size_t i, m, m_max; mpzv_t src; mpzv_t *dst = Tree + ceil_log2 (len) - 1; ASSERT (len == ((spv_size_t)1) << ceil_log2 (len)); x = 
mpzspv_init (2 * len, mpzspm); if (dolvl >= 0) { src = a; dst = &r; } else { /* Copy the roots into the destination level of the tree (negating if so desired), set the source to this level (which now contains the possibly negated roots), and advance the destination level of the tree to the next level */ src = *dst; /* we consider x + root[i], which means we consider negated roots */ list_set (*dst--, a, len); } m = (dolvl == -1) ? 1 : 1 << (ceil_log2 (len) - 1 - dolvl); m_max = (dolvl == -1) ? len : 2 * m; for (; m < m_max && m < MUL_NTT_THRESHOLD; m *= 2) { /* dst = &r anyway for dolvl != -1 */ if (m == len / 2) dst = &r; if (TreeFile && list_out_raw (TreeFile, src, len) == ECM_ERROR) { outputf (OUTPUT_ERROR, "Error writing product tree of F\n"); return ECM_ERROR; } for (i = 0; i < len; i += 2 * m) list_mul (t + i, src + i, m, 1, src + i + m, m, 1, t + len); list_mod (*dst, t, len, mpzspm->modulus); src = *dst--; } for (; m < m_max; m *= 2) { ASSERT (m > 1); /* This code does not do the sign change. 
Let's assume MUL_NTT_THRESHOLD is always large enough that the degree 1 product are done in the above loop */ /* dst = &r anyway for dolvl != -1 */ if (m == len / 2) dst = &r; for (i = 0; i < 2 * len; i += 4 * m) { if (TreeFile && list_out_raw (TreeFile, src + i / 2, 2 * m) == ECM_ERROR) return ECM_ERROR; mpzspv_from_mpzv (x, i, src + i / 2, m, mpzspm); mpzspv_from_mpzv (x, i + 2 * m, src + i / 2 + m, m, mpzspm); mpzspv_mul_ntt (x, i, x, i, m, x, i + 2 * m, m, 2 * m, 1, 2 * m, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_FFT2 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); mpzspv_to_mpzv (x, i, *dst + i / 2, 2 * m, mpzspm); /* we only do the mod reduction to reduce the file size a bit */ if (TreeFile) list_mod (*dst + i / 2, *dst + i / 2, 2 * m, mpzspm->modulus); } src = *dst--; } mpzspv_clear (x, mpzspm); return 0; } /* 2 NTTs of size 2 * len * 2 NTTs of size len * * memory: 2 * len mpzspv coeffs */ void ntt_PrerevertDivision (mpzv_t a, mpzv_t b, mpzv_t invb, mpzspv_t sp_b, mpzspv_t sp_invb, spv_size_t len, mpzv_t t, mpzspm_t mpzspm) { mpzspv_t x; if (len < PREREVERTDIVISION_NTT_THRESHOLD) { PrerevertDivision (a, b, invb, len, t, mpzspm->modulus); return; } x = mpzspv_init (2 * len, mpzspm); /* y = TOP (TOP (a) * invb) */ mpzspv_set_sp (x, 0, 0, len + 1, mpzspm); mpzspv_from_mpzv (x, len + 1, a + len, len - 1, mpzspm); mpzspv_mul_ntt (x, 0, x, 0, 2 * len, sp_invb, 0, UNUSED, 2 * len, 0, 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); mpzspv_normalise (x, 0, len, mpzspm); mpzspv_mul_ntt (x, 0, x, 0, len, sp_b, 0, UNUSED, len, 0, 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); mpzspv_to_mpzv (x, 0, t, len, mpzspm); mpzspv_clear (x, mpzspm); list_sub (t, t, a + len, len - 1); list_sub (a, a, t, len); /* can we avoid this mod without risking overflow later? 
*/ list_mod (a, a, len, mpzspm->modulus); } /* memory: 7/2 * len mpzspv coeffs */ void ntt_PolyInvert (mpzv_t q, mpzv_t b, spv_size_t len, mpzv_t t, mpzspm_t mpzspm) { spv_size_t k = POLYINVERT_NTT_THRESHOLD / 2; mpzspv_t w, x, y, z; if (len < POLYINVERT_NTT_THRESHOLD) { PolyInvert (q, b, len, t, mpzspm->modulus); return; } PolyInvert (q + len - k, b + len - k, k, t, mpzspm->modulus); w = mpzspv_init (len / 2, mpzspm); x = mpzspv_init (len, mpzspm); y = mpzspv_init (len, mpzspm); z = mpzspv_init (len, mpzspm); mpzspv_from_mpzv (x, 0, q + len - k - 1, k + 1, mpzspm); mpzspv_from_mpzv (y, 0, b, len - 1, mpzspm); for (; k < len; k *= 2) { mpzspv_set (w, 0, x, 1, k, mpzspm); mpzspv_set (z, 0, y, len - 2 * k, 2 * k - 1, mpzspm); mpzspv_mul_ntt (z, 0, z, 0, 2 * k - 1, x, 0, k + 1, 2 * k, 0, 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_FFT2 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); mpzspv_normalise (z, k, k, mpzspm); mpzspv_neg (z, 0, z, k, k, mpzspm); mpzspv_mul_ntt (x, 0, x, 0, 0, z, 0, k, 2 * k, 0, 0, mpzspm, NTT_MUL_STEP_FFT2 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); if (2 * k < len) mpzspv_normalise (x, k, k, mpzspm); mpzspv_set (x, 1, x, k, k, mpzspm); /* legal overlap */ mpzspv_set (x, k + 1, w, 0, MIN(k, len / 2 - 1), mpzspm); } mpzspv_to_mpzv (x, 1, q, len - POLYINVERT_NTT_THRESHOLD / 2, mpzspm); #if defined DEBUG ntt_mul (t, q, b, len, NULL, 0, mpzspm); list_mod (t, t, 2 * len - 1, mpzspm->modulus); spv_size_t i; for (i = len - 1; i < 2 * len - 2; i++) if (mpz_cmp_ui (t[i], 0)) printf ("error in ntt_PolyInvert\n"); if (mpz_cmp_ui (t[2 * len - 2], 1)) printf ("error in ntt_PolyInvert-\n"); #endif mpzspv_clear (w, mpzspm); mpzspv_clear (x, mpzspm); mpzspv_clear (y, mpzspm); mpzspv_clear (z, mpzspm); } /* memory: 4 * len mpzspv coeffs */ int ntt_polyevalT (mpzv_t b, spv_size_t len, mpzv_t *Tree, mpzv_t T, mpzspv_t sp_invF, mpzspm_t mpzspm, char *TreeFilenameStem) { spv_size_t m, i; FILE *TreeFile = NULL; /* assume this "small" malloc will not fail in normal usage */ 
char *TreeFilename = NULL; mpzv_t *Tree_orig = Tree; int level = 0; /* = ceil_log2 (len / m) - 1 */ mpzspv_t x = mpzspv_init (2 * len, mpzspm); mpzspv_t y = mpzspv_init (2 * len, mpzspm); if (TreeFilenameStem) { TreeFilename = (char *) malloc (strlen (TreeFilenameStem) + 1 + 2 + 1); if (TreeFilename == NULL) { fprintf (stderr, "Cannot allocate memory in ntt_polyevalT\n"); exit (1); } } mpzspv_from_mpzv (x, 0, b, len, mpzspm); mpzspv_mul_ntt(x, 0, x, 0, len, sp_invF, 0, UNUSED, 2 * len, 0, 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); mpzspv_normalise (x, len - 1, len, mpzspm); mpzspv_set (y, 0, x, len - 1, len, mpzspm); /* y = high (b * invF) */ mpzspv_reverse (y, 0, len, mpzspm); /* y = rev (high (b * invF)) */ for (m = len / 2; m >= POLYEVALT_NTT_THRESHOLD; m /= 2) { if (TreeFilenameStem) { Tree = &T; sprintf (TreeFilename, "%s.%d", TreeFilenameStem, level); TreeFile = fopen (TreeFilename, "rb"); if (TreeFile == NULL) { outputf (OUTPUT_ERROR, "Error opening file %s for product tree of F\n", TreeFilename); mpzspv_clear (x, mpzspm); mpzspv_clear (y, mpzspm); return ECM_ERROR; } list_inp_raw (*Tree, TreeFile, len); fclose (TreeFile); unlink (TreeFilename); } for (i = 0; i < len; i += 2 * m) { list_revert (*Tree + i, m); mpzspv_set_sp (x, 0, 1, 1, mpzspm); mpzspv_from_mpzv (x, 1, *Tree + i, m, mpzspm); /* x contains reversed monic poly */ mpzspv_mul_ntt (x, 0, x, 0, m + 1, y, i, 2 * m, 2 * m, 0, 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_FFT2 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); if (m > POLYEVALT_NTT_THRESHOLD) mpzspv_normalise (x, m, m, mpzspm); list_revert (*Tree + i + m, m); mpzspv_set_sp (x, 2 * m, 1, 1, mpzspm); mpzspv_from_mpzv (x, 2 * m + 1, *Tree + i + m, m, mpzspm); mpzspv_mul_ntt(x, 2 * m, x, 2 * m, m + 1, y, i, UNUSED, 2 * m, 0, 0, mpzspm, NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT); if (m > POLYEVALT_NTT_THRESHOLD) mpzspv_normalise (x, 3 * m, m, mpzspm); mpzspv_set (y, i, x, 3 * m, m, mpzspm); mpzspv_set (y, i + 
m, x, m, m, mpzspm); } Tree++; level++; } mpzspv_clear (x, mpzspm); mpzspv_to_mpzv (y, 0, T, len, mpzspm); /* T = rev (high (b * invF)) */ mpzspv_clear (y, mpzspm); for (i = 0; i < len; i++) mpz_mod (T[i], T[i], mpzspm->modulus); for (; m >= 1; m /= 2) { if (TreeFilenameStem) { sprintf (TreeFilename, "%s.%d", TreeFilenameStem, level); TreeFile = fopen (TreeFilename, "rb"); if (TreeFile == NULL) { outputf (OUTPUT_ERROR, "Error opening file %s for product tree of F\n", TreeFilename); return ECM_ERROR; } } TUpTree (T, Tree_orig, len, T + len, level++, 0, mpzspm->modulus, TreeFile); if (TreeFilenameStem) { fclose (TreeFile); unlink (TreeFilename); } } if (TreeFilenameStem) free (TreeFilename); list_swap (b, T, len); return 0; } ecm-6.4.4/auxi.c0000644023561000001540000000760212106741273010335 00000000000000/* Auxiliary functions for GMP-ECM. Copyright 2002, 2003, 2004, 2005, 2006, 2007, 2011, 2012 Paul Zimmermann, Alexander Kruppa, Laurent Fousse, Jim Fougeron, Cyril Bouvier. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include "ecm-ecm.h" /****************************************************************************** * * * Auxiliary functions * * * ******************************************************************************/ /* returns the number of decimal digits of n */ unsigned int nb_digits (const mpz_t n) { mpz_t x; unsigned int size; size = mpz_sizeinbase (n, 10); /* the GMP documentation says mpz_sizeinbase returns the exact value, or one too big, thus: (a) either n < 10^(size-1), and n has size-1 digits (b) or n >= size-1, and n has size digits Note: mpz_sizeinbase returns 1 for n=0, thus we always have size >= 1. */ mpz_init (x); mpz_ui_pow_ui (x, 10, size - 1); if (mpz_cmpabs (n, x) < 0) size --; mpz_clear (x); return size; } /* Tries to read a number from a line from fd and stores it in r. Keeps reading lines until a number is found. Lines beginning with "#" are skipped. Returns 1 if a number was successfully read, 0 if no number can be read (i.e. at EOF) Function is now simpler. Much of the logic (other than skipping # lines is now contained within eval() function. */ int read_number (mpcandi_t *n, FILE *fd, int primetest) { int c; new_line: c = fgetc (fd); /* Skip comment lines beginning with '#' */ if (c == '#') { do c = fgetc (fd); while (c != EOF && !IS_NEWLINE(c)); if (IS_NEWLINE(c)) goto new_line; } if (c == EOF) return 0; ungetc (c, fd); if (!eval (n, fd, primetest)) goto new_line; #if 0 /* Code to test out eval_str function, which "appears" to work correctly. */ { /* warning!! Line is pretty small, but since this is just testing code, we can easily control the input for this test. 
This code should NEVER be compiled into released build, its only for testing of eval_str() */ char Line[500], *cp; fgets (Line, sizeof(Line), fd); if (!eval_str (n, Line, primetest, &cp)) goto new_line; fprintf (stderr, "\nLine is at %X cp is at %X\n", Line, cp); } #endif #if defined (DEBUG_EVALUATOR) if (n->cpExpr) fprintf (stderr, "%s\n", n->cpExpr); mpz_out_str (stderr, 10, n->n); fprintf (stderr, "\n"); #endif return 1; } int probab_prime_p (mpz_t N, int reps) { #ifdef WANT_SHELLCMD if (prpcmd != NULL) { FILE *fc; int r; fc = popen (prpcmd, "w"); if (fc != NULL) { gmp_fprintf (fc, "%Zd\n", N); r = pclose (fc); if (r == 0) /* Exit status of 0 means success = is a PRP */ return 1; else return 0; } else { fprintf (stderr, "Error executing the PRP command\n"); exit (EXIT_FAILURE); } } else #endif return mpz_probab_prime_p (N, reps); } ecm-6.4.4/ecm-params.h.pentium30000644023561000001540000000072112106741273013157 00000000000000#define MPZMOD_THRESHOLD 135 #define REDC_THRESHOLD 200 #define MPN_MUL_LO_THRESHOLD_TABLE {0, 0, 1, 1, 0, 0, 5, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 1, 14, 14, 16, 16, 1} #define NTT_GFP_TWIDDLE_DIF_BREAKOVER 11 #define NTT_GFP_TWIDDLE_DIT_BREAKOVER 11 #define MUL_NTT_THRESHOLD 262144 #define PREREVERTDIVISION_NTT_THRESHOLD 128 #define POLYINVERT_NTT_THRESHOLD 65536 #define POLYEVALT_NTT_THRESHOLD 16384 #define MPZSPV_NORMALISE_STRIDE 512 ecm-6.4.4/mpzspv.c0000644023561000001540000007253712106741273010737 00000000000000/* mpzspv.c - "mpz small prime polynomial" functions for arithmetic on mpzv's reduced modulo a mpzspm Copyright 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Dave Newman, Jason Papadopoulos, Alexander Kruppa, Paul Zimmermann. The SP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
The SP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the SP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include /* for stderr */ #include #include /* for memset */ #include "sp.h" mpzspv_t mpzspv_init (spv_size_t len, mpzspm_t mpzspm) { unsigned int i; mpzspv_t x = (mpzspv_t) malloc (mpzspm->sp_num * sizeof (spv_t)); if (x == NULL) return NULL; for (i = 0; i < mpzspm->sp_num; i++) { x[i] = (spv_t) sp_aligned_malloc (len * sizeof (sp_t)); if (x[i] == NULL) { while (i--) sp_aligned_free (x[i]); free (x); return NULL; } } return x; } void mpzspv_clear (mpzspv_t x, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (x, 0, 0, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) sp_aligned_free (x[i]); free (x); } /* check that: * - each of the spv's is at least offset + len long * - the data specified by (offset, len) is correctly normalised in the * range [0, sp) * * return 1 for success, 0 for failure */ int mpzspv_verify (mpzspv_t x, spv_size_t offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; spv_size_t j; for (i = 0; i < mpzspm->sp_num; i++) { for (j = offset; j < offset + len; j++) if (x[i][j] >= mpzspm->spm[i]->sp) return 0; } return 1; } void mpzspv_set (mpzspv_t r, spv_size_t r_offset, mpzspv_t x, spv_size_t x_offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (r, r_offset + len, 0, mpzspm)); ASSERT (mpzspv_verify (x, x_offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) spv_set (r[i] + r_offset, x[i] + x_offset, len); } void mpzspv_revcopy (mpzspv_t r, spv_size_t r_offset, mpzspv_t x, spv_size_t x_offset, spv_size_t len, mpzspm_t mpzspm) { 
unsigned int i; ASSERT (mpzspv_verify (r, r_offset + len, 0, mpzspm)); ASSERT (mpzspv_verify (x, x_offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) spv_rev (r[i] + r_offset, x[i] + x_offset, len); } void mpzspv_set_sp (mpzspv_t r, spv_size_t offset, sp_t c, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (r, offset + len, 0, mpzspm)); ASSERT (c < SP_MIN); /* not strictly necessary but avoids mod functions */ for (i = 0; i < mpzspm->sp_num; i++) spv_set_sp (r[i] + offset, c, len); } void mpzspv_neg (mpzspv_t r, spv_size_t r_offset, mpzspv_t x, spv_size_t x_offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (r, r_offset + len, 0, mpzspm)); ASSERT (mpzspv_verify (x, x_offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) spv_neg (r[i] + r_offset, x[i] + x_offset, len, mpzspm->spm[i]->sp); } void mpzspv_add (mpzspv_t r, spv_size_t r_offset, mpzspv_t x, spv_size_t x_offset, mpzspv_t y, spv_size_t y_offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (r, r_offset + len, 0, mpzspm)); ASSERT (mpzspv_verify (x, x_offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) spv_add (r[i] + r_offset, x[i] + x_offset, y[i] + y_offset, len, mpzspm->spm[i]->sp); } void mpzspv_reverse (mpzspv_t x, spv_size_t offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; spv_size_t j; sp_t t; spv_t spv; ASSERT (mpzspv_verify (x, offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) { spv = x[i] + offset; for (j = 0; j < len - 1 - j; j++) { t = spv[j]; spv[j] = spv[len - 1 - j]; spv[len - 1 - j] = t; } } } /* Return {xp, xn} mod p. Assume 2p < B where B = 2^GMP_NUMB_LIMB. We first compute {xp, xn} / B^n mod p using Montgomery reduction, where the number N to factor has n limbs. Then we multiply by B^(n+1) mod p (precomputed) and divide by B mod p. 
Assume invm = -1/p mod B and Bpow = B^n mod p.
   NOTE(review): the last step of the function computes x0 * Bpow / B mod p
   with x0 = {xp, xn} / B^n mod p, so the result is {xp, xn} mod p only when
   Bpow is B^(n+1) mod p, which is what the sentence above says; the "B^n"
   here looks like a slip -- confirm against the code that fills
   spm->Bpow. */
static mp_limb_t
ecm_mod_1 (mp_ptr xp, mp_size_t xn, mp_limb_t p, mp_size_t n,
           mp_limb_t invm, mp_limb_t Bpow)
{
  mp_limb_t q, cy, hi, lo, x0, x1;

  if (xn == 0)
    return 0;

  /* the code below assumes xn <= n+1, thus we call mpn_mod_1 otherwise,
     but this should never (or rarely) happen */
  if (xn > n + 1)
    return mpn_mod_1 (xp, xn, p);

  x0 = xp[0];
  cy = (mp_limb_t) 0;
  /* n Montgomery steps, each one dividing the running value by B mod p;
     limbs past the end of the input are read as 0 */
  while (n-- > 0)
    {
      /* Invariant: cy is the input carry on xp[1], x0 is xp[0] */
      x1 = (xn > 1) ? xp[1] : 0;
      q = x0 * invm; /* q = -x0/p mod B */
      umul_ppmm (hi, lo, q, p); /* hi*B + lo = -x0 mod B */
      /* Add hi*B + lo to x1*B + x0. Since p <= B-2 we have
         hi*B + lo <= (B-1)(B-2) = B^2-3B+2, thus hi <= B-3 */
      hi += cy + (lo != 0); /* cannot overflow */
      x0 = x1 + hi;
      cy = x0 < hi; /* carry out of the limb addition */
      xn --;
      xp ++;
    }
  if (cy != 0)
    x0 -= p;
  /* now x0 = {xp, xn} / B^n mod p */
  umul_ppmm (x1, x0, x0, Bpow);
  /* since Bpow < p, x1 <= p-1 */
  q = x0 * invm;
  umul_ppmm (hi, lo, q, p);
  /* hi <= p-1 thus hi+x1+1 < 2p-1 < B */
  hi = hi + x1 + (lo != 0);
  /* final correction into [0, p) */
  while (hi >= p)
    hi -= p;
  return hi;
}

/* convert mpzvi to CRT representation, naive version: one call to
   ecm_mod_1 per small prime, result stored in x[j][offset] */
static void
mpzspv_from_mpzv_slow (mpzspv_t x, const spv_size_t offset, mpz_t mpzvi,
                       mpzspm_t mpzspm)
{
  const unsigned int sp_num = mpzspm->sp_num;
  unsigned int j;
  mp_size_t n = mpz_size (mpzspm->modulus);

  /* NOTE(review): the next two comments discuss mpn_preinv_mod_1, which
     this function does not call (it uses ecm_mod_1); they appear to be
     left over from an earlier implementation. */

  /* GMP's comments on mpn_preinv_mod_1:
   *
   * "This function used to be documented, but is now considered obsolete.  It
   * continues to exist for binary compatibility, even when not required
   * internally."
   *
   * It doesn't accept 0 as the dividend so we have to treat this case
   * separately */

  /* Note: we can't use the mul_c field for mpn_preinv_mod_1, since on
     64-bit it is floor(2^125/sp) where sp has 62 bits, and
     mpn_preinv_mod_1 needs floor(2^128/(4*sp))-2^64 = floor(2^126/sp)-2^64.
     On 32-bit it is floor(2^62/sp) where sp has 31 bits, and
     mpn_preinv_mod_1 needs floor(2^64/(2*sp))-2^32 = floor(2^63/sp)-2^32. */

  /* Note: we could improve this as follows. Assume the number N to factor
     has n limbs. Instead of computing v mod p by reducing v by the high
     limbs, we first compute v/B^(n-1) mod p by reducing v by the low limbs,
     then deduce v mod p using a precomputed value of B^(n-1) mod p.
     The reduction v/B is done by using a precomputed k = 1/B mod p,
     thus v1*B+v0 = (v1+k*v0)*B and so on. */

  for (j = 0; j < sp_num; j++)
    x[j][offset] = ecm_mod_1 (PTR(mpzvi), SIZ(mpzvi),
                              (mp_limb_t) mpzspm->spm[j]->sp, n,
                              mpzspm->spm[j]->invm, mpzspm->spm[j]->Bpow);
  /* The typecast to mp_limb_t assumes that mp_limb_t is at least
     as wide as sp_t */
}

/* convert mpzvi to CRT representation, fast version, assumes
   mpzspm->T has been precomputed (see mpzspm.c).
   The value is reduced level by level modulo the entries of T[d-1],
   T[d-2], ..., down to level i0 = I0_THRESHOLD; below that each group of
   at most 2^i0 primes is reduced directly with mpn_mod_1.
   (Presumably T[i][k] is a product of 2^i consecutive small primes --
   confirm in mpzspm.c.) */
static void
mpzspv_from_mpzv_fast (mpzspv_t x, const spv_size_t offset, mpz_t mpzvi,
                       mpzspm_t mpzspm)
{
  const unsigned int sp_num = mpzspm->sp_num;
  unsigned int i, j, k, i0 = I0_THRESHOLD, I0;
  mpzv_t *T = mpzspm->T;
  unsigned int d = mpzspm->d, ni;

  ASSERT (d > i0);

  /* T[0] serves as vector of temporary mpz_t's, since it contains the small
     primes, which are also in mpzspm->spm[j]->sp */
  /* initially we split mpzvi in two */
  ni = 1 << (d - 1);
  mpz_mod (T[0][0], mpzvi, T[d-1][0]);
  mpz_mod (T[0][ni], mpzvi, T[d-1][1]);
  for (i = d-1; i-- > i0;)
    { /* goes down from depth i+1 to i */
      ni = 1 << i;
      for (j = k = 0; j + ni < sp_num; j += 2*ni, k += 2)
        {
          /* the right half must be computed first: it still needs the
             unreduced T[0][j], which the second mpz_mod overwrites */
          mpz_mod (T[0][j+ni], T[0][j], T[i][k+1]);
          mpz_mod (T[0][j], T[0][j], T[i][k]);
        }
      /* for the last entry T[0][j] if j < sp_num, there is nothing to do */
    }

  /* last steps */
  I0 = 1 << i0;
  for (j = 0; j < sp_num; j += I0)
    for (k = j; k < j + I0 && k < sp_num; k++)
      x[k][offset] = mpn_mod_1 (PTR(T[0][j]), SIZ(T[0][j]),
                                (mp_limb_t) mpzspm->spm[k]->sp);
  /* The typecast to mp_limb_t assumes that mp_limb_t is at least
     as wide as sp_t */
}

/* convert an array of len mpz_t numbers to CRT representation modulo
   sp_num moduli.
   mpzv[i] is written to x[j][i + offset] for every prime j.  Zero is
   handled here; the remaining values must be positive and go through the
   fast path when mpzspm->T is available, the slow path otherwise. */
void
mpzspv_from_mpzv (mpzspv_t x, const spv_size_t offset, const mpzv_t mpzv,
    const spv_size_t len, mpzspm_t mpzspm)
{
  const unsigned int sp_num = mpzspm->sp_num;
  long i;

  ASSERT (mpzspv_verify (x, offset + len, 0, mpzspm));
  ASSERT (sizeof (mp_limb_t) >= sizeof (sp_t));

#if defined(_OPENMP)
#pragma omp parallel private(i) if (len > 16384)
  {
    /* Multi-threading with dynamic scheduling slows things down */
#pragma omp for schedule(static)
#endif
  for (i = 0; i < (long) len; i++)
  {
    unsigned int j;
    if (mpz_sgn (mpzv[i]) == 0)
      {
        for (j = 0; j < sp_num; j++)
          x[j][i + offset] = 0;
      }
    else
      {
        ASSERT(mpz_sgn (mpzv[i]) > 0); /* We can't handle negative values */
        if (mpzspm->T == NULL)
          mpzspv_from_mpzv_slow (x, i + offset, mpzv[i], mpzspm);
        else
          mpzspv_from_mpzv_fast (x, i + offset, mpzv[i], mpzspm);
      }
  }
#if defined(_OPENMP)
  }
#endif
}

/* See: Daniel J. Bernstein and Jonathan P. Sorenson,
 * Modular Exponentiation via the explicit Chinese Remainder Theorem
 *
 * memory: MPZSPV_NORMALISE_STRIDE floats
 *
 * Converts len entries of x (starting at offset) back to integers in
 * mpzv[0 .. len-1], working on blocks of at most MPZSPV_NORMALISE_STRIDE
 * entries.  For each entry the sum over the primes of
 * (x_i * crt3[i] mod p_i) * crt1[i] is accumulated, and f[k] accumulates
 * 0.5 plus the sum of the fractions (x_i * crt3[i] mod p_i) / p_i; its
 * integer part selects the entry of crt2 that is added at the end.
 * Exits the program if the float buffer cannot be allocated. */
void
mpzspv_to_mpzv (mpzspv_t x, spv_size_t offset, mpzv_t mpzv,
    spv_size_t len, mpzspm_t mpzspm)
{
  unsigned int i;
  spv_size_t k, l;
  float *f = (float *) malloc (MPZSPV_NORMALISE_STRIDE * sizeof (float));
  float prime_recip;
  sp_t t;
  spm_t *spm = mpzspm->spm;
  mpz_t mt;

  if (f == NULL)
    {
      fprintf (stderr, "Cannot allocate memory in mpzspv_to_mpzv\n");
      exit (1);
    }

  ASSERT (mpzspv_verify (x, offset, len, mpzspm));

  mpz_init (mt);
  for (l = 0; l < len; l += MPZSPV_NORMALISE_STRIDE)
    {
      spv_size_t stride = MIN (MPZSPV_NORMALISE_STRIDE, len - l);

      for (k = 0; k < stride; k++)
        {
          f[k] = 0.5;
          mpz_set_ui (mpzv[k + l], 0);
        }

      for (i = 0; i < mpzspm->sp_num; i++)
        {
          prime_recip = 1.0f / (float) spm[i]->sp;

          for (k = 0; k < stride; k++)
            {
              t = sp_mul (x[i][l + k + offset], mpzspm->crt3[i], spm[i]->sp,
                  spm[i]->mul_c);

              /* an sp_t wider than unsigned long cannot be passed to
                 mpz_addmul_ui, so it goes through the temporary mt */
              if (sizeof (sp_t) > sizeof (unsigned long))
                {
                  mpz_set_sp (mt, t);
                  mpz_addmul (mpzv[l + k], mpzspm->crt1[i], mt);
                }
              else
                {
                  mpz_addmul_ui (mpzv[l + k], mpzspm->crt1[i], t);
                }

              f[k] += (float) t * prime_recip;
            }
        }

      for (k = 0; k < stride; k++)
        mpz_add (mpzv[l + k], mpzv[l + k], mpzspm->crt2[(unsigned int) f[k]]);
    }

  mpz_clear (mt);
free (f); } void mpzspv_pwmul (mpzspv_t r, spv_size_t r_offset, mpzspv_t x, spv_size_t x_offset, mpzspv_t y, spv_size_t y_offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (r, r_offset + len, 0, mpzspm)); ASSERT (mpzspv_verify (x, x_offset, len, mpzspm)); ASSERT (mpzspv_verify (y, y_offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) spv_pwmul (r[i] + r_offset, x[i] + x_offset, y[i] + y_offset, len, mpzspm->spm[i]->sp, mpzspm->spm[i]->mul_c); } /* B&S: ecrt mod m mod p_j. * * memory: MPZSPV_NORMALISE_STRIDE mpzspv coeffs * 6 * MPZSPV_NORMALISE_STRIDE sp's * MPZSPV_NORMALISE_STRIDE floats */ void mpzspv_normalise (mpzspv_t x, spv_size_t offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i, j, sp_num = mpzspm->sp_num; spv_size_t k, l; sp_t v; spv_t s, d, w; spm_t *spm = mpzspm->spm; float prime_recip; float *f; mpzspv_t t; ASSERT (mpzspv_verify (x, offset, len, mpzspm)); f = (float *) malloc (MPZSPV_NORMALISE_STRIDE * sizeof (float)); s = (spv_t) malloc (3 * MPZSPV_NORMALISE_STRIDE * sizeof (sp_t)); d = (spv_t) malloc (3 * MPZSPV_NORMALISE_STRIDE * sizeof (sp_t)); if (f == NULL || s == NULL || d == NULL) { fprintf (stderr, "Cannot allocate memory in mpzspv_normalise\n"); exit (1); } t = mpzspv_init (MPZSPV_NORMALISE_STRIDE, mpzspm); memset (s, 0, 3 * MPZSPV_NORMALISE_STRIDE * sizeof (sp_t)); for (l = 0; l < len; l += MPZSPV_NORMALISE_STRIDE) { spv_size_t stride = MIN (MPZSPV_NORMALISE_STRIDE, len - l); /* FIXME: use B&S Theorem 2.2 */ for (k = 0; k < stride; k++) f[k] = 0.5; for (i = 0; i < sp_num; i++) { prime_recip = 1.0f / (float) spm[i]->sp; for (k = 0; k < stride; k++) { x[i][l + k + offset] = sp_mul (x[i][l + k + offset], mpzspm->crt3[i], spm[i]->sp, spm[i]->mul_c); f[k] += (float) x[i][l + k + offset] * prime_recip; } } for (i = 0; i < sp_num; i++) { for (k = 0; k < stride; k++) { umul_ppmm (d[3 * k + 1], d[3 * k], mpzspm->crt5[i], (sp_t) f[k]); d[3 * k + 2] = 0; } for (j = 0; j < sp_num; j++) { w = x[j] + offset; v 
= mpzspm->crt4[i][j]; for (k = 0; k < stride; k++) umul_ppmm (s[3 * k + 1], s[3 * k], w[k + l], v); /* this mpn_add_n accounts for about a third of the function's * runtime */ mpn_add_n (d, d, s, 3 * stride); } for (k = 0; k < stride; k++) t[i][k] = mpn_mod_1 (d + 3 * k, 3, spm[i]->sp); } mpzspv_set (x, l + offset, t, 0, stride, mpzspm); } mpzspv_clear (t, mpzspm); free (s); free (d); free (f); } void mpzspv_to_ntt (mpzspv_t x, spv_size_t offset, spv_size_t len, spv_size_t ntt_size, int monic, mpzspm_t mpzspm) { unsigned int i; spv_size_t j, log2_ntt_size; spm_t spm; spv_t spv; ASSERT (mpzspv_verify (x, offset, len, mpzspm)); ASSERT (mpzspv_verify (x, offset + ntt_size, 0, mpzspm)); log2_ntt_size = ceil_log_2 (ntt_size); for (i = 0; i < mpzspm->sp_num; i++) { spm = mpzspm->spm[i]; spv = x[i] + offset; if (ntt_size < len) { for (j = ntt_size; j < len; j += ntt_size) spv_add (spv, spv, spv + j, ntt_size, spm->sp); } if (ntt_size > len) spv_set_zero (spv + len, ntt_size - len); if (monic) spv[len % ntt_size] = sp_add (spv[len % ntt_size], 1, spm->sp); spv_ntt_gfp_dif (spv, log2_ntt_size, spm); } } void mpzspv_from_ntt (mpzspv_t x, spv_size_t offset, spv_size_t ntt_size, spv_size_t monic_pos, mpzspm_t mpzspm) { unsigned int i; spv_size_t log2_ntt_size; spm_t spm; spv_t spv; ASSERT (mpzspv_verify (x, offset, ntt_size, mpzspm)); log2_ntt_size = ceil_log_2 (ntt_size); for (i = 0; i < mpzspm->sp_num; i++) { spm = mpzspm->spm[i]; spv = x[i] + offset; spv_ntt_gfp_dit (spv, log2_ntt_size, spm); /* spm->sp - (spm->sp - 1) / ntt_size is the inverse of ntt_size */ spv_mul_sp (spv, spv, spm->sp - (spm->sp - 1) / ntt_size, ntt_size, spm->sp, spm->mul_c); if (monic_pos) spv[monic_pos % ntt_size] = sp_sub (spv[monic_pos % ntt_size], 1, spm->sp); } } void mpzspv_random (mpzspv_t x, spv_size_t offset, spv_size_t len, mpzspm_t mpzspm) { unsigned int i; ASSERT (mpzspv_verify (x, offset, len, mpzspm)); for (i = 0; i < mpzspm->sp_num; i++) spv_random (x[i] + offset, len, 
mpzspm->spm[i]->sp);
}

/* Do multiplication via NTT. Depending on the value of "steps", does
   in-place forward transform of x, in-place forward transform of y,
   pair-wise multiplication of x by y to r, in-place inverse transform of r.
   Contrary to calling these operations separately, this function does
   all the selected steps on a small-prime vector at a time, resulting in
   slightly better cache efficiency (also in preparation to storing NTT
   vectors on disk and reading them in for the multiplication).

   The steps are selected by the NTT_MUL_STEP_FFT1 (transform x),
   NTT_MUL_STEP_FFT2 (transform y), NTT_MUL_STEP_MUL and NTT_MUL_STEP_IFFT
   bits of "steps".  Before a forward transform the input is wrapped
   around or zero-padded to ntt_size and, if monic is non-zero, 1 is added
   at index len % ntt_size.  After the inverse transform the result is
   divided by ntt_size and, if monic_pos is non-zero, 1 is subtracted at
   index monic_pos % ntt_size. */
void
mpzspv_mul_ntt (mpzspv_t r, const spv_size_t offsetr,
    mpzspv_t x, const spv_size_t offsetx, const spv_size_t lenx,
    mpzspv_t y, const spv_size_t offsety, const spv_size_t leny,
    const spv_size_t ntt_size, const int monic, const spv_size_t monic_pos,
    mpzspm_t mpzspm, const int steps)
{
  spv_size_t log2_ntt_size;
  int i;

  ASSERT (mpzspv_verify (x, offsetx, lenx, mpzspm));
  ASSERT (mpzspv_verify (y, offsety, leny, mpzspm));
  ASSERT (mpzspv_verify (x, offsetx + ntt_size, 0, mpzspm));
  ASSERT (mpzspv_verify (y, offsety + ntt_size, 0, mpzspm));
  ASSERT (mpzspv_verify (r, offsetr + ntt_size, 0, mpzspm));

  log2_ntt_size = ceil_log_2 (ntt_size);

  /* Need parallelization at higher level (e.g., handling a branch of the
     product tree in one thread) to make this worthwhile for ECM */
#define MPZSPV_MUL_NTT_OPENMP 0

#if defined(_OPENMP) && MPZSPV_MUL_NTT_OPENMP
#pragma omp parallel if (ntt_size > 16384)
  {
#pragma omp for
#endif
  for (i = 0; i < (int) mpzspm->sp_num; i++)
    {
      spv_size_t j;
      spm_t spm = mpzspm->spm[i];
      spv_t spvr = r[i] + offsetr;
      spv_t spvx = x[i] + offsetx;
      spv_t spvy = y[i] + offsety;

      /* forward transform of x, in place */
      if ((steps & NTT_MUL_STEP_FFT1) != 0) {
        if (ntt_size < lenx)
          {
            for (j = ntt_size; j < lenx; j += ntt_size)
              spv_add (spvx, spvx, spvx + j, ntt_size, spm->sp);
          }
        if (ntt_size > lenx)
          spv_set_zero (spvx + lenx, ntt_size - lenx);

        if (monic)
          spvx[lenx % ntt_size] = sp_add (spvx[lenx % ntt_size], 1, spm->sp);

        spv_ntt_gfp_dif (spvx, log2_ntt_size, spm);
      }

      /* forward transform of y, in place */
      if ((steps & NTT_MUL_STEP_FFT2) != 0) {
        if (ntt_size < leny)
          {
            for (j = ntt_size; j < leny; j += ntt_size)
              spv_add (spvy, spvy, spvy + j, ntt_size, spm->sp);
          }
        if (ntt_size > leny)
          spv_set_zero (spvy + leny, ntt_size - leny);

        if (monic)
          spvy[leny % ntt_size] = sp_add (spvy[leny % ntt_size], 1, spm->sp);

        spv_ntt_gfp_dif (spvy, log2_ntt_size, spm);
      }

      /* point-wise product x * y -> r */
      if ((steps & NTT_MUL_STEP_MUL) != 0) {
        spv_pwmul (spvr, spvx, spvy, ntt_size, spm->sp, spm->mul_c);
      }

      /* inverse transform of r, in place */
      if ((steps & NTT_MUL_STEP_IFFT) != 0) {
        ASSERT (sizeof (mp_limb_t) >= sizeof (sp_t));

        spv_ntt_gfp_dit (spvr, log2_ntt_size, spm);

        /* spm->sp - (spm->sp - 1) / ntt_size is the inverse of ntt_size */
        spv_mul_sp (spvr, spvr, spm->sp - (spm->sp - 1) / ntt_size,
            ntt_size, spm->sp, spm->mul_c);

        if (monic_pos)
          spvr[monic_pos % ntt_size] = sp_sub (spvr[monic_pos % ntt_size],
              1, spm->sp);
      }
    }
#if defined(_OPENMP) && MPZSPV_MUL_NTT_OPENMP
  }
#endif
}

/* Computes a DCT-I of the length dctlen. Input is the spvlen coefficients
   in spv. tmp is temp space and must have space for 2*dctlen-2 sp_t's.
   The dctlen output values for prime j are written to
   dct[j][0 .. dctlen-1]. */
void
mpzspv_to_dct1 (mpzspv_t dct, const mpzspv_t spv, const spv_size_t spvlen,
                const spv_size_t dctlen, mpzspv_t tmp,
                const mpzspm_t mpzspm)
{
  const spv_size_t l = 2 * (dctlen - 1); /* Length for the DFT */
  const spv_size_t log2_l = ceil_log_2 (l);
  int j;

#ifdef _OPENMP
#pragma omp parallel private(j)
  {
#pragma omp for
#endif
  for (j = 0; j < (int) mpzspm->sp_num; j++)
    {
      const spm_t spm = mpzspm->spm[j];
      spv_size_t i;

      /* Make a symmetric copy of spv in tmp. I.e. with spv = [3, 2, 1],
         spvlen = 3, dctlen = 5 (hence l = 8), we want
         tmp = [3, 2, 1, 0, 0, 0, 1, 2] */
      spv_set (tmp[j], spv[j], spvlen);
      spv_rev (tmp[j] + l - spvlen + 1, spv[j] + 1, spvlen - 1);
      /* Now we have [3, 2, 1, ?, ?, ?, 1, 2]. Fill the ?'s with zeros.
         */
      spv_set_sp (tmp[j] + spvlen, (sp_t) 0, l - 2 * spvlen + 1);

#if 0
      printf ("mpzspv_to_dct1: tmp[%d] = [", j);
      for (i = 0; i < l; i++)
          printf ("%lu, ", tmp[j][i]);
      printf ("]\n");
#endif

      spv_ntt_gfp_dif (tmp[j], log2_l, spm);

#if 0
      printf ("mpzspv_to_dct1: tmp[%d] = [", j);
      for (i = 0; i < l; i++)
          printf ("%lu, ", tmp[j][i]);
      printf ("]\n");
#endif

      /* The forward transform is scrambled. We want elements [0 ... l/2]
         of the unscrambled data, that is all the coefficients with the most
         significant bit in the index (in log2(l) word size) unset, plus the
         element at index l/2. By scrambling, these map to the elements with
         even index, plus the element at index 1.
         The elements with scrambled index 2*i are stored in dct[j][i], the
         element with scrambled index 1 is stored in dct[j][l / 2] */

#ifdef WANT_ASSERT
      /* Test that the coefficients are symmetric (if they were unscrambled)
         and that our algorithm for finding identical coefficients in the
         scrambled data works */
      {
        spv_size_t m = 5;
        for (i = 2; i < l; i += 2L)
          {
            /* This works, but why? */
            if (i + i / 2L > m)
                m = 2L * m + 1L;

            ASSERT (tmp[j][i] == tmp[j][m - i]);
#if 0
            printf ("mpzspv_to_dct1: DFT[%lu] == DFT[%lu]\n", i, m - i);
#endif
          }
      }
#endif

      /* Copy coefficients to dct buffer */
      for (i = 0; i < l / 2; i++)
        dct[j][i] = tmp[j][i * 2];
      dct[j][l / 2] = tmp[j][1];
    }
#ifdef _OPENMP
  }
#endif
}

/* Multiply the polynomial in "dft" by the RLP in "dct", where "dft"
   contains the polynomial coefficients (not FFT'd yet) and "dct"
   contains the DCT-I coefficients of the RLP. The latter are
   assumed to be in the layout produced by mpzspv_to_dct1().
   Output are the coefficients of the product polynomial, stored in dft.
The "steps" parameter controls which steps are computed:
   NTT_MUL_STEP_FFT1: do forward transform
   NTT_MUL_STEP_MUL: do point-wise product
   NTT_MUL_STEP_IFFT: do inverse transform
   len is the transform length; the transforms and the final scaling by
   1/len all work on dft[j][0 .. len-1]. */
void
mpzspv_mul_by_dct (mpzspv_t dft, const mpzspv_t dct, const spv_size_t len,
                   const mpzspm_t mpzspm, const int steps)
{
  int j;
  spv_size_t log2_len = ceil_log_2 (len);

#ifdef _OPENMP
#pragma omp parallel private(j)
  {
#pragma omp for
#endif
  for (j = 0; j < (int) (mpzspm->sp_num); j++)
    {
      const spm_t spm = mpzspm->spm[j];
      const spv_t spv = dft[j];
      unsigned long i, m;

      /* Forward DFT of dft[j] */
      if ((steps & NTT_MUL_STEP_FFT1) != 0)
        spv_ntt_gfp_dif (spv, log2_len, spm);

      /* Point-wise product */
      if ((steps & NTT_MUL_STEP_MUL) != 0)
        {
          m = 5UL;

          /* entries 0 and 1 have no partner; entry 1 uses the DCT
             coefficient stored at index len / 2 */
          spv[0] = sp_mul (spv[0], dct[j][0], spm->sp, spm->mul_c);
          spv[1] = sp_mul (spv[1], dct[j][len / 2UL], spm->sp, spm->mul_c);

          for (i = 2UL; i < len; i += 2UL)
            {
              /* This works, but why? */
              /* m runs through 5, 11, 23, ..., i.e. m = 3*2^k - 1 while
                 2^k <= i < 2^(k+1), so the even entry i is paired with the
                 odd entry 3*2^k - 1 - i; both are multiplied by the same
                 coefficient dct[j][i / 2] */
              if (i + i / 2UL > m)
                m = 2UL * m + 1;

              spv[i] = sp_mul (spv[i], dct[j][i / 2UL], spm->sp, spm->mul_c);
              spv[m - i] = sp_mul (spv[m - i], dct[j][i / 2UL], spm->sp,
                                   spm->mul_c);
            }
        }

      /* Inverse transform of dft[j] */
      if ((steps & NTT_MUL_STEP_IFFT) != 0)
        {
          spv_ntt_gfp_dit (spv, log2_len, spm);

          /* Divide by transform length. FIXME: scale the DCT of h instead */
          spv_mul_sp (spv, spv, spm->sp - (spm->sp - 1) / len, len,
                      spm->sp, spm->mul_c);
        }
    }
#ifdef _OPENMP
  }
#endif
}

/* In-place squaring, for every small prime, of the data in dft[j] using a
   transform of length len = 2 * 2^ceil_log_2(n) and a weight signal built
   from a primitive 3rd root of unity; the steps are described by the
   comments in the body.
   NOTE(review): the loop bound 2 * n - 3 below wraps around for n < 2 if
   spv_size_t is unsigned -- confirm that callers guarantee n >= 2. */
void
mpzspv_sqr_reciprocal (mpzspv_t dft, const spv_size_t n,
                       const mpzspm_t mpzspm)
{
  const spv_size_t log2_n = ceil_log_2 (n);
  const spv_size_t len = ((spv_size_t) 2) << log2_n;
  const spv_size_t log2_len = 1 + log2_n;
  int j;

  ASSERT(mpzspm->max_ntt_size % 3UL == 0UL);
  ASSERT(len % 3UL != 0UL);
  ASSERT(mpzspm->max_ntt_size % len == 0UL);

#ifdef _OPENMP
#pragma omp parallel
  {
#pragma omp for
#endif
    for (j = 0; j < (int) (mpzspm->sp_num); j++)
      {
        const spm_t spm = mpzspm->spm[j];
        const spv_t spv = dft[j];
        sp_t w1, w2, invlen;
        const sp_t sp = spm->sp, mul_c = spm->mul_c;
        spv_size_t i;

        /* Zero out NTT elements [n ..
len-n] */
        spv_set_sp (spv + n, (sp_t) 0, len - 2*n + 1);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          {
            printf ("ntt_sqr_reciprocal: NTT vector mod %lu\n", sp);
            ntt_print_vec ("ntt_sqr_reciprocal: before weighting:", spv, len);
          }
#endif

        /* Compute the root for the weight signal, a 3rd primitive root
           of unity */
        w1 = sp_pow (spm->prim_root, mpzspm->max_ntt_size / 3UL, sp,
                     mul_c);
        /* Compute iw= 1/w */
        w2 = sp_pow (spm->inv_prim_root, mpzspm->max_ntt_size / 3UL, sp,
                     mul_c);
#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          printf ("w1 = %lu ,w2 = %lu\n", w1, w2);
#endif
        ASSERT(sp_mul(w1, w2, sp, mul_c) == (sp_t) 1);
        ASSERT(w1 != (sp_t) 1);
        ASSERT(sp_pow (w1, 3UL, sp, mul_c) == (sp_t) 1);
        ASSERT(w2 != (sp_t) 1);
        ASSERT(sp_pow (w2, 3UL, sp, mul_c) == (sp_t) 1);

        /* Fill NTT elements spv[len-n+1 .. len-1] with coefficients and
           apply weight signal to spv[i] and spv[l-i] for 0 <= i < n
           Use the fact that w^i + w^{-i} = -1 if i != 0 (mod 3). */
        /* three indices per iteration, with weights 1, w1 and w2 */
        for (i = 0; i + 2 < n; i += 3)
          {
            sp_t t, u;

            if (i > 0)
              spv[len - i] = spv[i];

            t = spv[i + 1];
            u = sp_mul (t, w1, sp, mul_c);
            spv[i + 1] = u;
            spv[len - i - 1] = sp_neg (sp_add (t, u, sp), sp);

            t = spv[i + 2];
            u = sp_mul (t, w2, sp, mul_c);
            spv[i + 2] = u;
            spv[len - i - 2] = sp_neg (sp_add (t, u, sp), sp);
          }
        /* at most two indices are left over after the loop */
        if (i < n && i > 0)
          {
            spv[len - i] = spv[i];
          }
        if (i + 1 < n)
          {
            sp_t t, u;
            t = spv[i + 1];
            u = sp_mul (t, w1, sp, mul_c);
            spv[i + 1] = u;
            spv[len - i - 1] = sp_neg (sp_add (t, u, sp), sp);
          }

#ifdef TRACE_ntt_sqr_reciprocal
      if (j == 0)
        ntt_print_vec ("ntt_sqr_reciprocal: after weighting:", spv, len);
#endif

        /* Forward DFT of dft[j] */
        spv_ntt_gfp_dif (spv, log2_len, spm);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after forward transform:",
                         spv, len);
#endif

        /* Square the transformed vector point-wise */
        spv_pwmul (spv, spv, spv, len, sp, mul_c);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after point-wise squaring:",
                         spv, len);
#endif

        /* Inverse transform of dft[j] */
        spv_ntt_gfp_dit (spv, log2_len, spm);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after inverse transform:",
                         spv, len);
#endif

        /* Un-weight and divide by transform length */
        invlen = sp - (sp - (sp_t) 1) / len; /* invlen = 1/len (mod sp) */
        /* fold the 1/len scaling into the two non-trivial weights */
        w1 = sp_mul (invlen, w1, sp, mul_c);
        w2 = sp_mul (invlen, w2, sp, mul_c);
        for (i = 0; i < 2 * n - 3; i += 3)
          {
            spv[i] = sp_mul (spv[i], invlen, sp, mul_c);
            spv[i + 1] = sp_mul (spv[i + 1], w2, sp, mul_c);
            spv[i + 2] = sp_mul (spv[i + 2], w1, sp, mul_c);
          }
        if (i < 2 * n - 1)
          spv[i] = sp_mul (spv[i], invlen, sp, mul_c);
        if (i < 2 * n - 2)
          spv[i + 1] = sp_mul (spv[i + 1], w2, sp, mul_c);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after un-weighting:", spv, len);
#endif

        /* Separate the coefficients of R in the wrapped-around product. */

        /* Set w1 = cuberoot(1)^l where cuberoot(1) is the same primitive
           3rd root of unity we used for the weight signal */
        w1 = sp_pow (spm->prim_root, mpzspm->max_ntt_size / 3UL, sp,
                     mul_c);
        w1 = sp_pow (w1, len % 3UL, sp, mul_c);

        /* Set w2 = 1/(w1 - 1/w1). Incidentally, w2 = 1/sqrt(-3) */
        w2 = sp_inv (w1, sp, mul_c);
        w2 = sp_sub (w1, w2, sp);
        w2 = sp_inv (w2, sp, mul_c);
#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          printf ("For separating: w1 = %lu, w2 = %lu\n", w1, w2);
#endif

        for (i = len - (2*n - 2); i <= len / 2; i++)
          {
            sp_t t, u;
            /* spv[i] = s_i + w^{-l} s_{l-i}.
               spv[l-i] = s_{l-i} + w^{-l} s_i */
            t = sp_mul (spv[i], w1, sp, mul_c); /* t = w^l s_i + s_{l-i} */
            t = sp_sub (t, spv[len - i], sp);   /* t = w^l s_i - w^{-l} s_i */
            t = sp_mul (t, w2, sp, mul_c);      /* t = s_i */

            u = sp_sub (spv[i], t, sp);         /* u = w^{-l} s_{l-i} */
            u = sp_mul (u, w1, sp, mul_c);      /* u = s_{l-i} */
            spv[i] = t;
            spv[len - i] = u;
            /* at the midpoint i == l-i, so both halves must agree */
            ASSERT(i < len / 2 || t == u);
          }

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after un-wrapping:", spv, len);
#endif
      }
#ifdef _OPENMP
    }
#endif
}
ecm-6.4.4/acinclude.m40000644023561000001540000002514312106741274011415 00000000000000dnl Various routines adapted from gmp-4.1.4 define(X86_PATTERN, [[i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-* | viac3*-*-*]]) dnl GMP_INIT([M4-DEF-FILE]) dnl ----------------------- dnl Initializations for GMP config.m4 generation. dnl dnl FIXME: The generated config.m4 doesn't get recreated by config.status. dnl Maybe the relevant "echo"s should go through AC_CONFIG_COMMANDS. AC_DEFUN([GMP_INIT], [ifelse([$1], , gmp_configm4=config.m4, gmp_configm4="[$1]") gmp_tmpconfigm4=cnfm4.tmp gmp_tmpconfigm4i=cnfm4i.tmp gmp_tmpconfigm4p=cnfm4p.tmp rm -f $gmp_tmpconfigm4 $gmp_tmpconfigm4i $gmp_tmpconfigm4p ]) dnl GMP_FINISH dnl ---------- dnl Create config.m4 from its accumulated parts. dnl dnl __CONFIG_M4_INCLUDED__ is used so that a second or subsequent include dnl of config.m4 is harmless. dnl dnl A separate ifdef on the angle bracket quoted part ensures the quoting dnl style there is respected. The basic defines from gmp_tmpconfigm4 are dnl fully quoted but are still put under an ifdef in case any have been dnl redefined by one of the m4 include files. dnl dnl Doing a big ifdef within asm-defs.m4 and/or other macro files wouldn't dnl work, since it'd interpret parentheses and quotes in dnl comments, and dnl having a whole file as a macro argument would overflow the string space dnl on BSD m4. 
AC_DEFUN([GMP_FINISH], [AC_REQUIRE([GMP_INIT]) echo "creating $gmp_configm4" echo ["d""nl $gmp_configm4. Generated automatically by configure."] > $gmp_configm4 if test -f $gmp_tmpconfigm4; then echo ["changequote(<,>)"] >> $gmp_configm4 echo ["ifdef(<__CONFIG_M4_INCLUDED__>,,<"] >> $gmp_configm4 cat $gmp_tmpconfigm4 >> $gmp_configm4 echo [">)"] >> $gmp_configm4 echo ["changequote(\`,')"] >> $gmp_configm4 rm $gmp_tmpconfigm4 fi echo ["ifdef(\`__CONFIG_M4_INCLUDED__',,\`"] >> $gmp_configm4 if test -f $gmp_tmpconfigm4i; then cat $gmp_tmpconfigm4i >> $gmp_configm4 rm $gmp_tmpconfigm4i fi if test -f $gmp_tmpconfigm4p; then cat $gmp_tmpconfigm4p >> $gmp_configm4 rm $gmp_tmpconfigm4p fi echo ["')"] >> $gmp_configm4 echo ["define(\`__CONFIG_M4_INCLUDED__')"] >> $gmp_configm4 ]) dnl GMP_PROG_M4 dnl ----------- dnl Find a working m4, either in $PATH or likely locations, and setup $M4 dnl and an AC_SUBST accordingly. If $M4 is already set then it's a user dnl choice and is accepted with no checks. GMP_PROG_M4 is like dnl AC_PATH_PROG or AC_CHECK_PROG, but tests each m4 found to see if it's dnl good enough. dnl dnl See mpn/asm-defs.m4 for details on the known bad m4s. AC_DEFUN([GMP_PROG_M4], [AC_ARG_VAR(M4,[m4 macro processor]) AC_CACHE_CHECK([for suitable m4], gmp_cv_prog_m4, [if test -n "$M4"; then gmp_cv_prog_m4="$M4" else cat >conftest.m4 <<\EOF dnl Must protect this against being expanded during autoconf m4! dnl Dont put "dnl"s in this as autoconf will flag an error for unexpanded dnl macros. 
[define(dollarhash,``$][#'')ifelse(dollarhash(x),1,`define(t1,Y)', ``bad: $][# not supported (SunOS /usr/bin/m4) '')ifelse(eval(89),89,`define(t2,Y)', `bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4) ')ifelse(t1`'t2,YY,`good ')] EOF dnl ' <- balance the quotes for emacs sh-mode echo "trying m4" >&AC_FD_CC gmp_tmp_val=`(m4 conftest.m4) 2>&AC_FD_CC` echo "$gmp_tmp_val" >&AC_FD_CC if test "$gmp_tmp_val" = good; then gmp_cv_prog_m4="m4" else IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" dnl $ac_dummy forces splitting on constant user-supplied paths. dnl POSIX.2 word splitting is done only on the output of word expansions, dnl not every word. This closes a longstanding sh security hole. ac_dummy="$PATH:/usr/5bin" for ac_dir in $ac_dummy; do test -z "$ac_dir" && ac_dir=. echo "trying $ac_dir/m4" >&AC_FD_CC gmp_tmp_val=`($ac_dir/m4 conftest.m4) 2>&AC_FD_CC` echo "$gmp_tmp_val" >&AC_FD_CC if test "$gmp_tmp_val" = good; then gmp_cv_prog_m4="$ac_dir/m4" break fi done IFS="$ac_save_ifs" if test -z "$gmp_cv_prog_m4"; then AC_MSG_ERROR([No usable m4 in \$PATH or /usr/5bin (see config.log for reasons).]) fi fi rm -f conftest.m4 fi]) M4="$gmp_cv_prog_m4" AC_SUBST(M4) ]) dnl GMP_DEFINE(MACRO, DEFINITION [, LOCATION]) dnl ------------------------------------------ dnl Define M4 macro MACRO as DEFINITION in temporary file. dnl dnl If LOCATION is `POST', the definition will appear after any include() dnl directives inserted by GMP_INCLUDE. Mind the quoting! No shell dnl variables will get expanded. Don't forget to invoke GMP_FINISH to dnl create file config.m4. config.m4 uses `<' and '>' as quote characters dnl for all defines. AC_DEFUN([GMP_DEFINE], [AC_REQUIRE([GMP_INIT]) echo ['define(<$1>, <$2>)'] >>ifelse([$3], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4) ]) dnl GMP_TRY_ASSEMBLE(asm-code,[action-success][,action-fail]) dnl ---------------------------------------------------------- dnl Attempt to assemble the given code. 
dnl Do "action-success" if this succeeds, "action-fail" if not. dnl dnl conftest.o and conftest.out are available for inspection in dnl "action-success". If either action does a "break" out of a loop then dnl an explicit "rm -f conftest*" will be necessary. dnl dnl This is not unlike AC_TRY_COMPILE, but there's no default includes or dnl anything in "asm-code", everything wanted must be given explicitly. AC_DEFUN([GMP_TRY_ASSEMBLE], [cat >conftest.s <&AC_FD_CC ifelse([$2],,:,[$2]) else cat conftest.out >&AC_FD_CC echo "configure: failed program was:" >&AC_FD_CC cat conftest.s >&AC_FD_CC ifelse([$3],,:,[$3]) fi rm -f conftest* ]) dnl GMP_ASM_TYPE dnl ------------ dnl Can we say ".type", and how? dnl dnl For i386 GNU/Linux ELF systems, and very likely other ELF systems, dnl .type and .size are important on functions in shared libraries. If dnl .type is omitted and the mainline program references that function then dnl the code will be copied down to the mainline at load time like a piece dnl of data. If .size is wrong or missing (it defaults to 4 bytes or some dnl such) then incorrect bytes will be copied and a segv is the most likely dnl result. In any case such copying is not what's wanted, a .type dnl directive will ensure a PLT entry is used. dnl dnl In GMP the assembler functions are normally only used from within the dnl library (since most programs are not interested in the low level dnl routines), and in those circumstances a missing .type isn't fatal, dnl letting the problem go unnoticed. tests/mpn/t-asmtype.c aims to check dnl for it. 
AC_DEFUN([GMP_ASM_TYPE], [AC_CACHE_CHECK([for assembler .type directive], gmp_cv_asm_type, [gmp_cv_asm_type= for gmp_tmp_prefix in @ \# %; do GMP_TRY_ASSEMBLE([ .type sym,${gmp_tmp_prefix}function], [if grep "\.type pseudo-op used outside of \.def/\.endef ignored" conftest.out >/dev/null; then : ; else gmp_cv_asm_type=".type \$][1,${gmp_tmp_prefix}\$][2" break fi]) done rm -f conftest* ]) echo ["define(, <$gmp_cv_asm_type>)"] >> $gmp_tmpconfigm4 ]) dnl GMP_ASM_GLOBL dnl ------------- dnl Can we say `.global'? AC_DEFUN([GMP_ASM_GLOBL], [AC_CACHE_CHECK([how to export a symbol], gmp_cv_asm_globl, [case $host in *-*-hpux*) gmp_cv_asm_globl=".export" ;; *) gmp_cv_asm_globl=".globl" ;; esac ]) echo ["define(, <$gmp_cv_asm_globl>)"] >> $gmp_tmpconfigm4 ]) dnl GMP_ASM_TEXT dnl ------------ AC_DEFUN([GMP_ASM_TEXT], [AC_CACHE_CHECK([how to switch to text section], gmp_cv_asm_text, [case $host in *-*-aix*) gmp_cv_asm_text=[".csect .text[PR]"] ;; *-*-hpux*) gmp_cv_asm_text=".code" ;; *) gmp_cv_asm_text=".text" ;; esac ]) echo ["define(, <$gmp_cv_asm_text>)"] >> $gmp_tmpconfigm4 ]) dnl GMP_ASM_LABEL_SUFFIX dnl -------------------- dnl Should a label have a colon or not? AC_DEFUN([GMP_ASM_LABEL_SUFFIX], [AC_CACHE_CHECK([what assembly label suffix to use], gmp_cv_asm_label_suffix, [case $host in # Empty is only for the HP-UX hppa assembler; hppa gas requires a colon. *-*-hpux*) gmp_cv_asm_label_suffix= ;; *) gmp_cv_asm_label_suffix=: ;; esac ]) echo ["define(, <\$][1$gmp_cv_asm_label_suffix>)"] >> $gmp_tmpconfigm4 ]) dnl ECM_INCLUDE(FILE) dnl --------------------- dnl Add an include_mpn() to config.m4. FILE should be a path dnl relative to the main source directory, for example dnl dnl ECM_INCLUDE(`powerpc64/defs.m4') dnl AC_DEFUN([ECM_INCLUDE], [AC_REQUIRE([GMP_INIT]) echo ["include($1)"] >> $gmp_tmpconfigm4 ]) dnl GMP_ASM_UNDERSCORE dnl ------------------ dnl Determine whether global symbols need to be prefixed with an underscore. 
dnl A test program is linked to an assembler module with or without an dnl underscore to see which works. dnl dnl This method should be more reliable than grepping a .o file or using dnl nm, since it corresponds to what a real program is going to do. Note dnl in particular that grepping doesn't work with SunOS 4 native grep since dnl that grep seems to have trouble with '\0's in files. AC_DEFUN([GMP_ASM_UNDERSCORE], [AC_REQUIRE([GMP_ASM_TEXT]) AC_REQUIRE([GMP_ASM_GLOBL]) AC_REQUIRE([GMP_ASM_LABEL_SUFFIX]) AC_CACHE_CHECK([if globals are prefixed by underscore], gmp_cv_asm_underscore, [cat >conftes1.c <conftes2.s <>conftes2.s < #include #include /* GMP header file */ #include "ecm.h" /* ecm header file */ int main (int argc, char *argv[]) { mpz_t n, f; int res; double B1; if (argc != 3) { fprintf (stderr, "Usage: ecmfactor \n"); exit (1); } mpz_init (n); /* read number on command line */ if (mpz_set_str (n, argv[1], 10)) { fprintf (stderr, "Invalid number: %s\n", argv[1]); exit (1); } B1 = atof (argv[2]); mpz_init (f); /* for potential factor */ printf ("Performing one curve with B1=%1.0f\n", B1); res = ecm_factor (f, n, B1, NULL); if (res > 0) { printf ("found factor in step %u: ", res); mpz_out_str (stdout, 10, f); printf ("\n"); #if 0 printf ("lucky curve was b*y^2 = x^3 + a*x^2 + x\n"); printf ("with a = (v-u)^3*(3*u+v)/(4*u^3*v)-2,"); printf (" u = sigma^2-5, v = 4*sigma\n"); #endif } else if (res == ECM_NO_FACTOR_FOUND) printf ("found no factor\n"); else printf ("error\n"); mpz_clear (f); mpz_clear (n); return 0; } ecm-6.4.4/ChangeLog0000644023561000001540000135304412113421551010772 00000000000000------------------------------------------------------------------------ r2438 | kruppa | 2013-02-27 16:16:07 +0100 (Wed, 27 Feb 2013) | 3 lines Remove -t option from man page Makefile should look for ecm.xml in $(source)/ ------------------------------------------------------------------------ r2436 | kruppa | 2013-02-26 19:46:56 +0100 (Tue, 26 Feb 2013) | 2 lines 
Replaced several alloca() by malloc() to avoid segfault with very large P+-1 stage 2 ------------------------------------------------------------------------ r2435 | kruppa | 2013-02-26 19:44:05 +0100 (Tue, 26 Feb 2013) | 2 lines Removed memory leak due to surplus mpres_init() ------------------------------------------------------------------------ r2434 | zimmerma | 2013-02-22 15:07:14 +0100 (Fri, 22 Feb 2013) | 2 lines [main.c] removed -t option (should have been removed in r1860) ------------------------------------------------------------------------ r2433 | kruppa | 2013-02-22 13:26:09 +0100 (Fri, 22 Feb 2013) | 3 lines Use malloc() instead of alloca() for tmp in mpn_fft_fft_bailey_decompose() to avoid segfault with very large stage 2 ------------------------------------------------------------------------ r2422 | zimmerma | 2013-02-19 21:04:41 +0100 (Tue, 19 Feb 2013) | 2 lines [champions.h] updated for ECM ------------------------------------------------------------------------ r2421 | zimmerma | 2013-02-19 21:02:29 +0100 (Tue, 19 Feb 2013) | 2 lines [ChangeLog] updated ------------------------------------------------------------------------ r2419 | zimmerma | 2013-02-19 20:58:19 +0100 (Tue, 19 Feb 2013) | 2 lines [INSTALL-ecm] updated ------------------------------------------------------------------------ r2417 | zimmerma | 2013-02-19 20:42:03 +0100 (Tue, 19 Feb 2013) | 2 lines [NEWS] updated ------------------------------------------------------------------------ r2416 | kruppa | 2013-02-19 19:42:04 +0100 (Tue, 19 Feb 2013) | 5 lines Merged r1971 from trunk: applied patch from Leif Leonhardy to make the assembly code work with --enable-shared (see http://trac.sagemath.org/sage_trac/ticket/11705) ------------------------------------------------------------------------ r2414 | kruppa | 2013-02-19 17:36:35 +0100 (Tue, 19 Feb 2013) | 2 lines Updated changelog ------------------------------------------------------------------------ r2413 | kruppa | 2013-02-19
16:15:15 +0100 (Tue, 19 Feb 2013) | 2 lines Test was backwards :( ------------------------------------------------------------------------ r2412 | kruppa | 2013-02-19 16:10:27 +0100 (Tue, 19 Feb 2013) | 2 lines Define correct __gmpn_redc_{12} prototype for GMP <5.1. Define REDC{12} macros only if the functions exist. ------------------------------------------------------------------------ r2411 | kruppa | 2013-02-19 13:29:43 +0100 (Tue, 19 Feb 2013) | 2 lines Use defined() for HAVE_ALLOCA_H test ------------------------------------------------------------------------ r2407 | kruppa | 2013-02-15 14:22:50 +0100 (Fri, 15 Feb 2013) | 2 lines MinGW has alloca() prototype in malloc.h ------------------------------------------------------------------------ r2406 | kruppa | 2013-02-14 13:08:03 +0100 (Thu, 14 Feb 2013) | 3 lines Merging r2344: Remove stray '$' ------------------------------------------------------------------------ r2405 | kruppa | 2013-02-13 19:18:14 +0100 (Wed, 13 Feb 2013) | 2 lines Update version numbers for 6.4.4 ------------------------------------------------------------------------ r2404 | kruppa | 2013-02-13 18:31:09 +0100 (Wed, 13 Feb 2013) | 2 lines Fixed typo ------------------------------------------------------------------------ r2402 | kruppa | 2013-02-13 17:55:16 +0100 (Wed, 13 Feb 2013) | 2 lines Merge r2401 into 6.4.4 branch ------------------------------------------------------------------------ r2400 | kruppa | 2013-02-13 17:49:56 +0100 (Wed, 13 Feb 2013) | 2 lines Merge of commit r2320 ------------------------------------------------------------------------ r2399 | kruppa | 2013-02-13 17:42:57 +0100 (Wed, 13 Feb 2013) | 1 line Branch for patchlevel release 6.4.4 ------------------------------------------------------------------------ r2096 | kruppa | 2012-06-12 17:35:10 +0200 (Tue, 12 Jun 2012) | 2 lines Changed paths: M /branches/6.4.3/build.vc10/Makefile.am Removed bench_mulredc from EXTRA_DIST to avoid including .svn dir in 
distribution ------------------------------------------------------------------------ r2095 | kruppa | 2012-06-12 17:26:17 +0200 (Tue, 12 Jun 2012) | 2 lines Changed paths: M /branches/6.4.3/TODO M /branches/6.4.3/ecm.1 M /branches/6.4.3/ecm.xml Remove incorrect hyphen in "Peter Lawrence Montgomery" ------------------------------------------------------------------------ r2094 | brian_gladman | 2012-06-12 17:23:55 +0200 (Tue, 12 Jun 2012) | 1 line Changed paths: M /branches/6.4.3/build.vc10/tests.py Add a few more tests that I missed earlier ------------------------------------------------------------------------ r2093 | brian_gladman | 2012-06-12 16:45:25 +0200 (Tue, 12 Jun 2012) | 1 line Changed paths: M /branches/6.4.3/build.vc10/config.h M /branches/6.4.3/build.vc10/readme.txt M /branches/6.4.3/build.vc10/tests.py Minor update to Windows files and add new tests ------------------------------------------------------------------------ r2090 | kruppa | 2012-06-12 14:48:43 +0200 (Tue, 12 Jun 2012) | 2 lines Changed paths: M /trunk/test.pm1 Added test for bug fixed in revision 2068 ------------------------------------------------------------------------ r2088 | kruppa | 2012-06-12 14:16:03 +0200 (Tue, 12 Jun 2012) | 2 lines Changed paths: M /trunk/NEWS Updated for 6.4.3 release ------------------------------------------------------------------------ r2082 | kruppa | 2012-06-07 14:20:20 +0200 (Thu, 07 Jun 2012) | 2 lines Changed paths: M /trunk/cudawrapper.c M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/factor.c M /trunk/main.c M /trunk/random.c Make random value more 64-bit-like, following suggestion by Jayson King ------------------------------------------------------------------------ r2072 | zimmerma | 2012-06-04 10:33:24 +0200 (Mon, 04 Jun 2012) | 2 lines Changed paths: M /trunk/NEWS [NEWS] added item ------------------------------------------------------------------------ r2068 | kruppa | 2012-06-01 23:15:44 +0200 (Fri, 01 Jun 2012) | 2 lines Changed paths: M
/trunk/mpzspm.c Replace mpz_init_set_ui() by mpz_set_sp() so it works under Windows ------------------------------------------------------------------------ r1878 | zimmerma | 2012-03-19 10:11:57 +0100 (Mon, 19 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.core2 M /trunk/ecm-params.h.corei5 updated default tuning parameters ------------------------------------------------------------------------ r1877 | bouvierc | 2012-03-18 22:31:30 +0100 (Sun, 18 Mar 2012) | 2 lines Changed paths: D /trunk/gpu/gpu_ecm_cc13 Remove old gpu code. ------------------------------------------------------------------------ r1876 | brian_gladman | 2012-03-17 22:42:08 +0100 (Sat, 17 Mar 2012) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/ecm/ecm.vcxproj.filters remove trial.c from VC++ build ------------------------------------------------------------------------ r1875 | zimmerma | 2012-03-17 10:36:43 +0100 (Sat, 17 Mar 2012) | 3 lines Changed paths: M /trunk/NEWS M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/sp.h M /trunk/spm.c implement new LSB reduction of residues mod small primes in mpzspv.c, yields significant speedup ------------------------------------------------------------------------ r1874 | zimmerma | 2012-03-17 08:08:46 +0100 (Sat, 17 Mar 2012) | 3 lines Changed paths: M /trunk/batch.c M /trunk/ellparam_batch.c [batch.c] fixed copyright years and typo [ellparam_batch.c] fixed copyright years ------------------------------------------------------------------------ r1873 | bouvierc | 2012-03-16 18:38:41 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ellparam_batch.c Wrong names in the Copyright. 
------------------------------------------------------------------------ r1872 | zimmerma | 2012-03-16 18:35:39 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: D /trunk/gpu/getprime.c D /trunk/gpu/getprime.h D /trunk/gpu/makefile D /trunk/gpu/modular_arithmetic.c D /trunk/gpu/modular_arithmetic.h D /trunk/gpu/prototype.c D /trunk/gpu/prototype.h D /trunk/gpu/stage1-c.c D /trunk/gpu/stage1.c removed obsolete files ------------------------------------------------------------------------ r1871 | bouvierc | 2012-03-16 18:08:34 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ellparam_batch.c M /trunk/main.c Add licence in batch.c and ellparam_batch.c ------------------------------------------------------------------------ r1867 | zimmerma | 2012-03-16 17:03:40 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: M /trunk/Fgw.c M /trunk/Makefile.am M /trunk/NEWS M /trunk/auxarith.c M /trunk/auxi.c M /trunk/auxlib.c M /trunk/b1_ainc.c M /trunk/bestd.c M /trunk/build.vc10/config.h M /trunk/candi.c M /trunk/configure.in D /trunk/countsmooth.c M /trunk/ecm-ecm.h M /trunk/ecm-gmp.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h M /trunk/ecm2.c M /trunk/ecm_ntt.c M /trunk/ecmfactor.c M /trunk/eval.c M /trunk/factor.c M /trunk/getprime.c M /trunk/gpu/getprime.c M /trunk/gpu/modular_arithmetic.c M /trunk/gpu/prototype.c M /trunk/gpu/stage1-c.c M /trunk/gpu/stage1.c M /trunk/ks-multiply.c M /trunk/listz.c M /trunk/lucas.c M /trunk/main.c M /trunk/median.c M /trunk/memory.c M /trunk/mpmod.c M /trunk/mpmod.h M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/mul_fft.c M /trunk/mul_lo.c A /trunk/nodist/rho.gp (from /trunk/rho.gp:1864) M /trunk/ntt_gfp.c M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/polyeval.c M /trunk/powerpc64/mulredc.m4 M /trunk/powerpc64/mulredc_1_2.m4 M /trunk/powerpc64/redc.asm M /trunk/pp1.c M /trunk/random.c M /trunk/resume.c M /trunk/rho.c D /trunk/rho.gp D /trunk/runecm2.c M /trunk/schoen_strass.c M /trunk/sets_long.c M /trunk/sp.c M 
/trunk/sp.h M /trunk/spm.c M /trunk/spv.c M /trunk/stage2.c M /trunk/test.ecm M /trunk/test.pm1 M /trunk/test.pp1 M /trunk/testlong.pp1 M /trunk/toomcook.c M /trunk/tune.c updated the copyright headers to GPL 3 and LGPL 3 ------------------------------------------------------------------------ r1866 | zimmerma | 2012-03-16 16:05:59 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: A /trunk/nodist/countsmooth.c (from /trunk/countsmooth.c:1864) A /trunk/nodist/runecm2.c (from /trunk/runecm2.c:1864) moved countsmooth.c and runecm2.c to nodist ------------------------------------------------------------------------ r1865 | zimmerma | 2012-03-16 16:04:58 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: A /trunk/nodist [nodist] new directory for non-distributed files ------------------------------------------------------------------------ r1864 | bouvierc | 2012-03-16 14:50:33 +0100 (Fri, 16 Mar 2012) | 3 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/utils.h Forgot to commit the new Makefile! Now include also directly ecm-ecm.h in gpu/gpu_ecm/utils.h ------------------------------------------------------------------------ r1863 | bouvierc | 2012-03-16 14:37:12 +0100 (Fri, 16 Mar 2012) | 7 lines Changed paths: M /trunk/batch.c M /trunk/ecm-impl.h M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.h M /trunk/random.c Rewriting Makefile of GPUECM to separate libecm file from ecm-ecm file. [Makefile] Separate file from libecm. Goal: linking directly libecm. 
[batch.c] Now only include ecm-impl.h [random.c] No outputf in GPUECM [main.c] change #define to avoid conflicts ------------------------------------------------------------------------ r1862 | zimmerma | 2012-03-16 10:21:32 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: D /trunk/ecmfactor2.c [ecmfactor2.c] removed unmaintained program ------------------------------------------------------------------------ r1861 | zimmerma | 2012-03-16 10:16:14 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: D /trunk/bestdaux.c [bestdaux.c] removed unused file ------------------------------------------------------------------------ r1860 | zimmerma | 2012-03-16 10:05:05 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: D /trunk/trial.c [trial.c] removed unmaintained and untested file ------------------------------------------------------------------------ r1859 | zimmerma | 2012-03-16 09:39:46 +0100 (Fri, 16 Mar 2012) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/mpmod.c M /trunk/test.ecm [test.ecm] added one test [mpmod.c] added an ASSERT [Makefile.am] missing tab ------------------------------------------------------------------------ r1858 | zimmerma | 2012-03-16 08:51:11 +0100 (Fri, 16 Mar 2012) | 2 lines Changed paths: M /trunk/tune.c [tune.c] removed trailing blank ------------------------------------------------------------------------ r1857 | zimmerma | 2012-03-16 08:13:22 +0100 (Fri, 16 Mar 2012) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/test.pm1 M /trunk/test.pp1 [test.pp1] added one test and removed blank line at the end [test.pm1] removed blank line at the end [Makefile.am] added blank line between tests ------------------------------------------------------------------------ r1856 | bouvierc | 2012-03-15 19:05:05 +0100 (Thu, 15 Mar 2012) | 2 lines Changed paths: M /trunk/batch.c Forgot to remove unused variables. 
------------------------------------------------------------------------ r1855 | zimmerma | 2012-03-15 19:00:20 +0100 (Thu, 15 Mar 2012) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] fixed wrong patch in r1851 ------------------------------------------------------------------------ r1854 | bouvierc | 2012-03-15 16:01:21 +0100 (Thu, 15 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu Changes in Cuda_Dbl_mod ------------------------------------------------------------------------ r1853 | bouvierc | 2012-03-15 13:29:24 +0100 (Thu, 15 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu Rewrite Normalize function. ------------------------------------------------------------------------ r1852 | bouvierc | 2012-03-15 13:27:47 +0100 (Thu, 15 Mar 2012) | 2 lines Changed paths: M /trunk/batch.c Rewrite dup_add_batch2 with 6 residues (like dup_add_batch1) ------------------------------------------------------------------------ r1851 | zimmerma | 2012-03-15 12:09:28 +0100 (Thu, 15 Mar 2012) | 5 lines Changed paths: M /trunk/batch.c M /trunk/mpmod.c [mpmod.c] fixed bug in ecm_redc_n (found on gcc45 and gcc61), probably a new release 6.4.2 is needed [batch.c] reduced number of auxiliary variables from 5 to 2 in dup_add_batch1 (remains to do the same in dup_add_batch2) ------------------------------------------------------------------------ r1850 | zimmerma | 2012-03-15 08:08:05 +0100 (Thu, 15 Mar 2012) | 4 lines Changed paths: M /trunk/NEWS M /trunk/build.vc10/config.h M /trunk/configure.in M /trunk/main.c M /trunk/test.ecm [configure.in,build.vc10/config.h] bump version to 7.0-dev [main.c] make batch=1 mode the default one for ECM [test.ecm] added -batch=0 where needed ------------------------------------------------------------------------ r1847 | zimmerma | 2012-03-14 23:19:47 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/mpzspv.c [mpzspv.c] added note about possible improvement in mpzspv_from_mpzv_slow() 
------------------------------------------------------------------------ r1846 | zimmerma | 2012-03-14 20:38:17 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/README.dev [README.dev] added tag for 6.4.1 release ------------------------------------------------------------------------ r1844 | zimmerma | 2012-03-14 18:29:22 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/ChangeLog M /trunk/build.vc10/config.h M /trunk/configure.in This will be the 6.4.1 release (if all final tests pass) ------------------------------------------------------------------------ r1842 | zimmerma | 2012-03-14 18:25:52 +0100 (Wed, 14 Mar 2012) | 5 lines Changed paths: M /trunk/configure.in M /trunk/ecm-impl.h M /trunk/ecm-params.h.hppa2.0 M /trunk/ecm-params.h.ia64 M /trunk/ecm-params.h.mips64el M /trunk/ecm-params.h.powerpc970 M /trunk/ecm-params.h.sparc64 updated various tuning parameters [configure.in] tuning parameters for ia64 and hppa2.0 were not used! Also fixed check for MPIR [ecm-impl.h] fixed // comment ------------------------------------------------------------------------ r1840 | zimmerma | 2012-03-14 16:46:05 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/ecm.c [ecm.c] removed computation of number of MULs and SQRs, to save a few cycles ------------------------------------------------------------------------ r1839 | bouvierc | 2012-03-14 15:47:08 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu Change in modular subtraction. ------------------------------------------------------------------------ r1838 | zimmerma | 2012-03-14 15:35:59 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/tune.c [tune.c] check for failed memory allocation ------------------------------------------------------------------------ r1837 | bouvierc | 2012-03-14 15:28:56 +0100 (Wed, 14 Mar 2012) | 4 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu Add specific code for mul for CC 2.0 cards.
Treat differently access to constant variables. -> results in significant speed-up. ------------------------------------------------------------------------ r1836 | bouvierc | 2012-03-14 15:04:39 +0100 (Wed, 14 Mar 2012) | 3 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu Rewriting addition, subtraction and multiplication by 2. No comparison is needed anymore so Cuda_Cmp is removed. ------------------------------------------------------------------------ r1835 | bouvierc | 2012-03-14 14:53:30 +0100 (Wed, 14 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu Rewrite #define for asm statement. ------------------------------------------------------------------------ r1834 | zimmerma | 2012-03-14 09:27:53 +0100 (Wed, 14 Mar 2012) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/bench_mulredc.c [bench_mulredc.c] only print to stdout what goes to ecm-params.h so that we can do ./bench_mulredc >> ecm-params.h [Makefile.am] build bench_mulredc by default ------------------------------------------------------------------------ r1833 | zimmerma | 2012-03-13 21:04:49 +0100 (Tue, 13 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.core2 [ecm-params.h.core2] updated ------------------------------------------------------------------------ r1832 | zimmerma | 2012-03-13 18:19:44 +0100 (Tue, 13 Mar 2012) | 3 lines Changed paths: M /trunk/ChangeLog M /trunk/build.vc10/config.h M /trunk/configure.in [configure.in,build.vc10/config.h] changed version to 6.4.1-rc3 [ChangeLog] updated ------------------------------------------------------------------------ r1831 | zimmerma | 2012-03-13 13:47:38 +0100 (Tue, 13 Mar 2012) | 2 lines Changed paths: M /trunk/mpzspv.c [mpzspv.c] removed call to ecm_bdiv_r_1 since it is not working ------------------------------------------------------------------------ r1830 | zimmerma | 2012-03-13 13:15:09 +0100 (Tue, 13 Mar 2012) | 5 lines Changed paths: M /trunk/ecm-params.h.corei5 M /trunk/mpzspv.c
[ecm-params.h.corei5] updated [mpzspv.c] removed new code with mpn_preinv_mod_1() in mpzspv_from_mpzv_slow since it was wrong (added comment explaining why) added new ecm_bdiv_r_1 code (disabled for now) ------------------------------------------------------------------------ r1829 | brian_gladman | 2012-03-13 10:22:16 +0100 (Tue, 13 Mar 2012) | 1 line Changed paths: M /trunk/build.vc10/bench_mulredc/bench_mulredc.vcxproj M /trunk/build.vc10/bench_mulredc/bench_mulredc.vcxproj.filters M /trunk/sp.h ensure ATTRIBUTE_UNUSED is defined as empty for MSVC in sp.h ------------------------------------------------------------------------ r1828 | bouvierc | 2012-03-13 09:47:07 +0100 (Tue, 13 Mar 2012) | 3 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/cudakernel.cu [cudakernel.cu] Fix a bug [Makefile] Add 32-bit lib for CUDA ------------------------------------------------------------------------ r1827 | brian_gladman | 2012-03-13 09:43:40 +0100 (Tue, 13 Mar 2012) | 1 line Changed paths: M /trunk/build.vc10/bench_mulredc D /trunk/build.vc10/bench_mulredc/bench_mulredc.vcxproj.user remove file uploaded in error ------------------------------------------------------------------------ r1826 | zimmerma | 2012-03-13 09:24:34 +0100 (Tue, 13 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.athlon64 [ecm-params.h.athlon64] redo tuning ------------------------------------------------------------------------ r1825 | zimmerma | 2012-03-13 08:46:17 +0100 (Tue, 13 Mar 2012) | 2 lines Changed paths: M /trunk/README M /trunk/bench_mulredc.c changes suggested by David Cleaver for Windows+MingW64+Msys ------------------------------------------------------------------------ r1824 | zimmerma | 2012-03-13 08:30:42 +0100 (Tue, 13 Mar 2012) | 2 lines Changed paths: M /trunk/sp.h [sp.h] removed more compiler warnings (on gcc110) ------------------------------------------------------------------------ r1823 | zimmerma | 2012-03-13 08:17:49 +0100 (Tue, 13 Mar 2012) 
| 2 lines Changed paths: M /trunk/eval.c [eval.c] fixed compiler warnings (found on gcc70) ------------------------------------------------------------------------ r1822 | zimmerma | 2012-03-12 23:00:07 +0100 (Mon, 12 Mar 2012) | 4 lines Changed paths: M /trunk/configure.in M /trunk/mpzspv.c M /trunk/spm.c [mpzspv.c] use __gmpn_preinv_mod_1 in mpzspv_from_mpzv_slow() [configure.in] recognize __gmpn_preinv_mod_1 [spm.c] fixed typo ------------------------------------------------------------------------ r1821 | zimmerma | 2012-03-12 19:59:40 +0100 (Mon, 12 Mar 2012) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] modified batch1 test which was not working on 32-bit ------------------------------------------------------------------------ r1820 | zimmerma | 2012-03-12 18:25:09 +0100 (Mon, 12 Mar 2012) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] added two non-regression tests for bug fixed by r1819 ------------------------------------------------------------------------ r1819 | zimmerma | 2012-03-12 17:43:22 +0100 (Mon, 12 Mar 2012) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] fixed bug in mpresn_addsub (found on gcc45) ------------------------------------------------------------------------ r1818 | zimmerma | 2012-03-12 16:18:22 +0100 (Mon, 12 Mar 2012) | 5 lines Changed paths: M /trunk/bench_mulredc.c [bench_mulredc.c] use same cputime() function as in auxlib.c (should work on Windows too) and setup number of iterations to get about 100ms for each test (avoid long time on slow computers) ------------------------------------------------------------------------ r1817 | zimmerma | 2012-03-12 14:48:29 +0100 (Mon, 12 Mar 2012) | 7 lines Changed paths: M /trunk/COPYING M /trunk/COPYING.LIB M /trunk/Makefile.am M /trunk/NEWS M /trunk/bench_mulredc.c M /trunk/longlong.h M /trunk/mpmod.c M /trunk/pm1fs2.c [COPYING,COPYING.LIB] switched to GPL v3 and LGPL v3 [bench_mulredc.c,Makefile.am] fix for --disable-asm-redc [NEWS] added new items [longlong.h] 
copied umul_ppmm code for MIPS from GMP 5.0.4 [mpmod.c] now use mulredc_1 when available [pm1fs2.c] fixed issue when sizeof(unsigned long) < sizeof(sp_t) ------------------------------------------------------------------------ r1816 | brian_gladman | 2012-03-12 12:28:43 +0100 (Mon, 12 Mar 2012) | 1 line Changed paths: D /trunk/build.vc10/ecm.cuda.sln A /trunk/build.vc10/gpu_ecm.sln (from /trunk/build.vc10/ecm.cuda.sln:1814) rename the Visual Studio solution for the gpu build to gpu_ecm ------------------------------------------------------------------------ r1815 | brian_gladman | 2012-03-12 12:21:16 +0100 (Mon, 12 Mar 2012) | 1 line Changed paths: A /trunk/build.vc10/getopt.c A /trunk/build.vc10/getopt.h add files needed for the gpu build with MS Visual Studio and Nvidia Nsight ------------------------------------------------------------------------ r1814 | brian_gladman | 2012-03-12 12:05:58 +0100 (Mon, 12 Mar 2012) | 1 line Changed paths: M /trunk/build.vc10/config.h update to match MPIR 2.5.1 (which now has mpn_redc_2) ------------------------------------------------------------------------ r1813 | zimmerma | 2012-03-12 09:25:24 +0100 (Mon, 12 Mar 2012) | 2 lines Changed paths: M /trunk/bench_mulredc.c [bench_mulredc.c] fixed compiler warning ------------------------------------------------------------------------ r1809 | zimmerma | 2012-03-11 18:55:19 +0100 (Sun, 11 Mar 2012) | 3 lines Changed paths: M /trunk/batch.c M /trunk/build.vc10/config.h M /trunk/configure.in [batch.c] fixed bug since d_1 might not fit in an "unsigned long" on Windows [configure.in,build.vc10/config.h] bump version number to 6.4.1-rc2 ------------------------------------------------------------------------ r1808 | zimmerma | 2012-03-10 11:47:44 +0100 (Sat, 10 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.core2 M /trunk/ecm-params.h.corei5 added tuning parameters for MPIR ------------------------------------------------------------------------ 
r1807 | zimmerma | 2012-03-10 11:26:20 +0100 (Sat, 10 Mar 2012) | 2 lines Changed paths: M /trunk/configure.in [configure.in] recognize if GMP is MPIR ------------------------------------------------------------------------ r1806 | zimmerma | 2012-03-10 10:49:20 +0100 (Sat, 10 Mar 2012) | 3 lines Changed paths: M /trunk/batch.c M /trunk/ecm-impl.h M /trunk/mpmod.c [mpmod.c] new function mpresn_unpad to normalize mpz_t values [batch.c] don't forget to normalize x1 and z1 at the end!!! ------------------------------------------------------------------------ r1805 | zimmerma | 2012-03-10 10:33:46 +0100 (Sat, 10 Mar 2012) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] fixed for new batch1 reduction ------------------------------------------------------------------------ r1804 | zimmerma | 2012-03-09 22:50:17 +0100 (Fri, 09 Mar 2012) | 2 lines Changed paths: M /trunk/README M /trunk/bench_mulredc.c M /trunk/configure.in M /trunk/ecm-gmp.h M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.core2 M /trunk/ecm-params.h.corei5 M /trunk/mpmod.c M /trunk/mpmod.h M /trunk/test_mulredc.c M /trunk/x86_64/Makefile.am removed ecm_redc3 code from x86_64 (variable-size REDC assembly code) ------------------------------------------------------------------------ r1802 | zimmerma | 2012-03-09 18:21:09 +0100 (Fri, 09 Mar 2012) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] fixed error when __gmpn_add_nc is not defined ------------------------------------------------------------------------ r1801 | zimmerma | 2012-03-09 18:13:48 +0100 (Fri, 09 Mar 2012) | 5 lines Changed paths: M /trunk/batch.c M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/main.c M /trunk/mpmod.c M /trunk/test.ecm Change the definition of d in batch1 mode from mp_limb_t to mp_limb_t/B so that we can use LSB (Montgomery) division instead of MSB division, this gives a nice speedup. However this changes the input parameter A, and some tests are still failing. Do not use this version in production! 
------------------------------------------------------------------------ r1800 | bouvierc | 2012-03-09 15:21:35 +0100 (Fri, 09 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile Dependencies in Makefile for GPU-ECM ------------------------------------------------------------------------ r1799 | zimmerma | 2012-03-09 10:49:47 +0100 (Fri, 09 Mar 2012) | 6 lines Changed paths: M /trunk/README.dev M /trunk/batch.c M /trunk/configure.in M /trunk/ecm-impl.h M /trunk/mpmod.c fixed various problems with ecm-6.4.1-rc1 reported by David Cleaver (http://lists.gforge.inria.fr/pipermail/ecm-discuss/2012-March/004144.html): * replaced unsigned long by mp_limb_t in batch=1 (under Windows, unsigned long has 32 bits only) * configure.in: added -lm for mathematical functions ------------------------------------------------------------------------ r1798 | zimmerma | 2012-03-08 13:11:00 +0100 (Thu, 08 Mar 2012) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] GMPLIB is not needed for libecm ------------------------------------------------------------------------ r1797 | zimmerma | 2012-03-08 10:11:52 +0100 (Thu, 08 Mar 2012) | 2 lines Changed paths: M /trunk/sets_long.c [sets_long.c] added comments ------------------------------------------------------------------------ r1796 | zimmerma | 2012-03-08 08:40:29 +0100 (Thu, 08 Mar 2012) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] forgot ecm-params.h.corei5 ------------------------------------------------------------------------ r1795 | zimmerma | 2012-03-07 16:41:21 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/NEWS [NEWS] fixed typo ------------------------------------------------------------------------ r1794 | zimmerma | 2012-03-07 16:36:37 +0100 (Wed, 07 Mar 2012) | 7 lines Changed paths: M /trunk/ChangeLog M /trunk/INSTALL-ecm M /trunk/NEWS M /trunk/README.dev M /trunk/bench_mulredc.c M /trunk/ecmbench [bench_mulredc.c] fix for Svoboda [NEWS] updated for 6.4.1 [ChangeLog] idem 
[INSTALL-ecm] ibidem [ecmbench] rm -> rm -f [README.dev] updated ------------------------------------------------------------------------ r1793 | bouvierc | 2012-03-07 15:12:28 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile Some changes in the Makefile for GPU-ECM ------------------------------------------------------------------------ r1792 | zimmerma | 2012-03-07 15:11:10 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.powerpc970 [ecm-params.h.powerpc970] removed exec flag ------------------------------------------------------------------------ r1791 | zimmerma | 2012-03-07 15:02:14 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c [ecm-impl.h,mpmod.c] removed mult_modulus (was always equal to orig_modulus) ------------------------------------------------------------------------ r1790 | zimmerma | 2012-03-07 14:22:50 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/pm1fs2.c [pm1fs2.c] changed assert to avoid compiler warning with clang 2.9 ------------------------------------------------------------------------ r1789 | zimmerma | 2012-03-07 13:54:25 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/configure.in [configure.in] added missing build.vc10/bench_mulredc/Makefile ------------------------------------------------------------------------ r1788 | zimmerma | 2012-03-07 13:46:16 +0100 (Wed, 07 Mar 2012) | 3 lines Changed paths: M /trunk/README.dev M /trunk/build.vc10/config.h M /trunk/configure.in M /trunk/ecm-gmp.h change version to 6.4.1-rc1 switched assertions to off for the release candidate ------------------------------------------------------------------------ r1787 | zimmerma | 2012-03-07 13:13:20 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/listz.c M /trunk/stage2.c [listz.c] cleanup, and removed some dead code in #if 0 ... 
#endif ------------------------------------------------------------------------ r1786 | zimmerma | 2012-03-07 10:40:51 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm_ntt.c M /trunk/listz.c fix NEGATED_ROOTS=1 once for all ------------------------------------------------------------------------ r1785 | zimmerma | 2012-03-07 10:12:57 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/main.c [main.c] -treefile is valid for ECM only ------------------------------------------------------------------------ r1784 | zimmerma | 2012-03-07 10:01:19 +0100 (Wed, 07 Mar 2012) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] added test with -treefile ------------------------------------------------------------------------ r1783 | zimmerma | 2012-03-05 08:56:02 +0100 (Mon, 05 Mar 2012) | 3 lines Changed paths: M /trunk/ecm.c [ecm.c] with -v -v, print A=... and x0=... for coherence with command line options ------------------------------------------------------------------------ r1782 | zimmerma | 2012-03-02 17:27:57 +0100 (Fri, 02 Mar 2012) | 3 lines Changed paths: M /trunk/README.dev M /trunk/TODO [README.dev] added check of -treefile before a release [TODO] at some point we should raise sigma to 64 bits ------------------------------------------------------------------------ r1781 | bouvierc | 2012-03-01 17:43:16 +0100 (Thu, 01 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu M /trunk/gpu/gpu_ecm/main.c Better measurement of actual running time. ------------------------------------------------------------------------ r1779 | bouvierc | 2012-03-01 13:44:35 +0100 (Thu, 01 Mar 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/check.sh M /trunk/gpu/gpu_ecm/cudakernel.cu M /trunk/gpu/gpu_ecm/main.c Don't do modular reduction anymore in GPU arithmetic. 
------------------------------------------------------------------------ r1778 | bouvierc | 2012-02-29 15:50:23 +0100 (Wed, 29 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu M /trunk/gpu/gpu_ecm/main.c Prepare the code to use Montgomery-Svoboda algorithm for REDC ------------------------------------------------------------------------ r1777 | bouvierc | 2012-02-29 15:49:02 +0100 (Wed, 29 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c Add an include when linked for GPU-ECM ------------------------------------------------------------------------ r1775 | bouvierc | 2012-02-28 15:50:25 +0100 (Tue, 28 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/main.c Batch mode: allow to save and load s from a file. ------------------------------------------------------------------------ r1774 | bouvierc | 2012-02-27 11:45:40 +0100 (Mon, 27 Feb 2012) | 2 lines Changed paths: M /trunk/test.ecm Replace ecm and ./ecm by $ECM in two lines of the test file. ------------------------------------------------------------------------ r1773 | brian_gladman | 2012-02-24 16:47:31 +0100 (Fri, 24 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/ecm/ecm.vcxproj.filters ------------------------------------------------------------------------ r1772 | brian_gladman | 2012-02-24 16:43:18 +0100 (Fri, 24 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/config.h M /trunk/build.vc10/gpu_ecm/gpu_ecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj.filters Update the gpu_ecm build for Windows ------------------------------------------------------------------------ r1771 | bouvierc | 2012-02-24 15:15:31 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu M /trunk/gpu/gpu_ecm/cudakernel.h Make the CPU code handling the GPU clearer. 
------------------------------------------------------------------------ r1770 | bouvierc | 2012-02-24 13:58:59 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.c M /trunk/gpu/gpu_ecm/utils.h Minor change in main.c. Add some comments. ------------------------------------------------------------------------ r1769 | bouvierc | 2012-02-24 13:43:26 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c Fix a problem reported by Brian Gladman. ------------------------------------------------------------------------ r1768 | bouvierc | 2012-02-24 11:24:31 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.c M /trunk/gpu/gpu_ecm/utils.h All factors found and cofactors are printed the same way. ------------------------------------------------------------------------ r1767 | bouvierc | 2012-02-24 11:02:51 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/main.c Use the struct mpcandi_t. ------------------------------------------------------------------------ r1766 | bouvierc | 2012-02-24 10:45:26 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/utils.c M /trunk/resume.c Write the right program name and right version in resume file. ------------------------------------------------------------------------ r1765 | bouvierc | 2012-02-24 10:32:36 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudakernel.cu M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.c M /trunk/gpu/gpu_ecm/utils.h More clear variables' name. More comment in main.c ------------------------------------------------------------------------ r1764 | bouvierc | 2012-02-24 09:47:36 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.c M /trunk/gpu/gpu_ecm/utils.h Use write_resumefile_line from GMP-ECM. 
------------------------------------------------------------------------ r1763 | zimmerma | 2012-02-24 08:19:43 +0100 (Fri, 24 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/check.sh [check.sh] translated error message in english ------------------------------------------------------------------------ r1762 | bouvierc | 2012-02-23 18:19:07 +0100 (Thu, 23 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/README M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.c M /trunk/gpu/gpu_ecm/utils.h Using more functions from GMP-ECM. ------------------------------------------------------------------------ r1761 | bouvierc | 2012-02-23 18:18:26 +0100 (Thu, 23 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/check.sh Keep temp files when an error occurs in check.sh. ------------------------------------------------------------------------ r1760 | bouvierc | 2012-02-23 17:55:03 +0100 (Thu, 23 Feb 2012) | 2 lines Changed paths: M /trunk/ecm-ecm.h Delete duplicate prototypes. ------------------------------------------------------------------------ r1759 | bouvierc | 2012-02-23 17:11:10 +0100 (Thu, 23 Feb 2012) | 2 lines Changed paths: M /trunk/auxi.c M /trunk/ecm-ecm.h M /trunk/main.c Move some functions out of main.c in order to use them with GPU-ECM. ------------------------------------------------------------------------ r1758 | bouvierc | 2012-02-23 16:23:58 +0100 (Thu, 23 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/def.h M /trunk/gpu/gpu_ecm/main.c M /trunk/gpu/gpu_ecm/utils.c M /trunk/gpu/gpu_ecm/utils.h Start to use GMP-ECM functions in GPU-ECM when they exist. 
------------------------------------------------------------------------ r1757 | bouvierc | 2012-02-23 14:54:11 +0100 (Thu, 23 Feb 2012) | 4 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile A /trunk/gpu/gpu_ecm/README.dev D /trunk/gpu/gpu_ecm/cudaarith.cu D /trunk/gpu/gpu_ecm/cudaarith.h A /trunk/gpu/gpu_ecm/cudakernel.cu (from /trunk/gpu/gpu_ecm/cudautils.cu:1756) A /trunk/gpu/gpu_ecm/cudakernel.h (from /trunk/gpu/gpu_ecm/cudautils.h:1756) D /trunk/gpu/gpu_ecm/cudautils.cu D /trunk/gpu/gpu_ecm/cudautils.h A /trunk/gpu/gpu_ecm/def.h (from /trunk/gpu/gpu_ecm/main.h:1756) A /trunk/gpu/gpu_ecm/main.c (from /trunk/gpu/gpu_ecm/main.cu:1756) D /trunk/gpu/gpu_ecm/main.cu D /trunk/gpu/gpu_ecm/main.h A /trunk/gpu/gpu_ecm/utils.c (from /trunk/gpu/gpu_ecm/utils.cu:1756) D /trunk/gpu/gpu_ecm/utils.cu M /trunk/gpu/gpu_ecm/utils.h Reorganization of the code. C code is put in C files compiled with gcc and Cuda code is put in .cu files and compiled with nvcc. ------------------------------------------------------------------------ r1756 | bouvierc | 2012-02-22 21:18:28 +0100 (Wed, 22 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/utils.cu Fix a bug reported by Brian Gladman. ------------------------------------------------------------------------ r1755 | bouvierc | 2012-02-22 16:12:37 +0100 (Wed, 22 Feb 2012) | 5 lines Changed paths: M /trunk/ecm.c M /trunk/main.c M /trunk/test.ecm [ecm.c] Fix a problem for batch mode 2 when the value obtained from the parametrization is printed [main.c] Print which batch mode is used. [test.ecm] Add a test for batch mode 2. 
------------------------------------------------------------------------ r1754 | bouvierc | 2012-02-22 16:09:58 +0100 (Wed, 22 Feb 2012) | 2 lines Changed paths: D /trunk/gpu/gpu_ecm/obj D /trunk/gpu/gpu_ecm/test.sh Delete useless files ------------------------------------------------------------------------ r1753 | bouvierc | 2012-02-22 16:07:10 +0100 (Wed, 22 Feb 2012) | 4 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile A /trunk/gpu/gpu_ecm/README M /trunk/gpu/gpu_ecm/check.sh M /trunk/gpu/gpu_ecm/main.h [Makefile] A more-easy-to-use Makefile [README] README explaining how to compile and execute GPU-ECM and how to use check.sh ------------------------------------------------------------------------ r1752 | bouvierc | 2012-02-22 12:04:26 +0100 (Wed, 22 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c fix a bug in batch mode 2 ------------------------------------------------------------------------ r1751 | bouvierc | 2012-02-22 11:42:57 +0100 (Wed, 22 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ecm.c Fix batch mode 1 which didn't pass all the test because of previous commits. ------------------------------------------------------------------------ r1750 | bouvierc | 2012-02-22 10:21:27 +0100 (Wed, 22 Feb 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/main.cu M /trunk/gpu/gpu_ecm/utils.cu Fix some bugs for 32-bits machines. ------------------------------------------------------------------------ r1749 | bouvierc | 2012-02-21 18:25:00 +0100 (Tue, 21 Feb 2012) | 4 lines Changed paths: M /trunk/batch.c D /trunk/batchmode.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/main.c Now the choice of the which batch mode is used is not done by a #define. 
It can be passed as a argument: -batch[=1|2], with -batch being equivalent to -batch=1 ------------------------------------------------------------------------ r1748 | bouvierc | 2012-02-21 17:46:23 +0100 (Tue, 21 Feb 2012) | 2 lines Changed paths: A /trunk/batchmode.h A /trunk/ellparam_batch.c Forgot to add 2 new files in the last commit. ------------------------------------------------------------------------ r1747 | bouvierc | 2012-02-21 17:45:28 +0100 (Tue, 21 Feb 2012) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/batch.c M /trunk/ecm.c M /trunk/main.c Add elliptic parametrization for batch mode 2 Move the choice of A for batch mode in ecm.c (as the choice of sigma for Suyama) ------------------------------------------------------------------------ r1746 | brian_gladman | 2012-02-16 18:40:14 +0100 (Thu, 16 Feb 2012) | 1 line Changed paths: M /trunk/gpu/gpu_ecm/main.cu set IDLE priority for GPU application on Windows ------------------------------------------------------------------------ r1745 | brian_gladman | 2012-02-16 17:31:39 +0100 (Thu, 16 Feb 2012) | 1 line Changed paths: A /trunk/build.vc10/readme_gpu.txt add short readme for the Windows GPU build ------------------------------------------------------------------------ r1744 | brian_gladman | 2012-02-16 17:18:53 +0100 (Thu, 16 Feb 2012) | 1 line Changed paths: A /trunk/build.vc10/ecm.cuda.sln Add Visual Studio build for the GPU code ------------------------------------------------------------------------ r1743 | brian_gladman | 2012-02-16 17:15:45 +0100 (Thu, 16 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/getrusage.h A /trunk/build.vc10/gpu_ecm A /trunk/build.vc10/gpu_ecm/gpu_ecm.vcxproj M /trunk/gpu/gpu_ecm/main.cu M /trunk/gpu/gpu_ecm/main.h M /trunk/gpu/gpu_ecm/utils.cu M /trunk/gpu/gpu_ecm/utils.h M /trunk/gpu/modular_arithmetic.c ------------------------------------------------------------------------ r1742 | brian_gladman | 2012-02-13 22:33:07 +0100 (Mon, 13 Feb 2012) | 1 line 
Changed paths: M /trunk/build.vc10/config.h correct conflicted config.h ------------------------------------------------------------------------ r1741 | brian_gladman | 2012-02-13 22:06:19 +0100 (Mon, 13 Feb 2012) | 1 line Changed paths: M /trunk/bench_mulredc.c M /trunk/build.vc10/Makefile.am M /trunk/build.vc10/assembler/Makefile.am A /trunk/build.vc10/assembler/mulredc.asm M /trunk/build.vc10/assembler/mulredc.h A /trunk/build.vc10/assembler/redc.asm A /trunk/build.vc10/bench_mulredc A /trunk/build.vc10/bench_mulredc/Makefile.am A /trunk/build.vc10/bench_mulredc/bench_mulredc.vcxproj A /trunk/build.vc10/bench_mulredc/bench_mulredc.vcxproj.filters A /trunk/build.vc10/bench_mulredc/bench_mulredc.vcxproj.user M /trunk/build.vc10/config.h M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/ecm.sln A /trunk/build.vc10/getrusage.c A /trunk/build.vc10/getrusage.h M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj.filters M /trunk/build.vc10/tune/tune.vcxproj M /trunk/build.vc10/tune/tune.vcxproj.filters Revamp the Windows Visual Studio 2010 build ------------------------------------------------------------------------ r1738 | zimmerma | 2012-02-12 23:46:20 +0100 (Sun, 12 Feb 2012) | 3 lines Changed paths: M /trunk/bench_mulredc.c [bench_mulredc.c] added some Svoboda code (not tested so far) and avoid some warnings if mpn_redc_2 and mpn_redc_n are not defined ------------------------------------------------------------------------ r1737 | dcleaver | 2012-02-12 23:24:37 +0100 (Sun, 12 Feb 2012) | 1 line Changed paths: M /trunk/Fgw.c M /trunk/test.ecm Clean up comments and make ecm test deterministic ------------------------------------------------------------------------ r1736 | zimmerma | 2012-02-12 23:05:25 +0100 (Sun, 12 Feb 2012) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] fixed misplaced #ifdef, added comments, MPN_COPY -> mpn_copyi ------------------------------------------------------------------------ r1735 | dcleaver 
| 2012-02-12 21:50:37 +0100 (Sun, 12 Feb 2012) | 1 line Changed paths: M /trunk/test.ecm Added/fixed test to exercise r1734 patch ------------------------------------------------------------------------ r1734 | dcleaver | 2012-02-12 21:44:33 +0100 (Sun, 12 Feb 2012) | 1 line Changed paths: M /trunk/Fgw.c M /trunk/test.ecm Patched Fgw.c to allocate adequate memory for special inputs ------------------------------------------------------------------------ r1733 | zimmerma | 2012-02-12 20:15:50 +0100 (Sun, 12 Feb 2012) | 2 lines Changed paths: M /trunk/bench_mulredc.c M /trunk/configure.in M /trunk/mpmod.h also consider mpn_redc_n in bench_mulredc (not yet used in mpmod.c) ------------------------------------------------------------------------ r1732 | brian_gladman | 2012-02-12 19:54:10 +0100 (Sun, 12 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/assembler/mulredc.h M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/tune/tune.vcxproj Make use of assembler code the default for Windows ------------------------------------------------------------------------ r1731 | zimmerma | 2012-02-12 17:34:08 +0100 (Sun, 12 Feb 2012) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] fixed problem reported by David Cleaver on ecm-discuss ------------------------------------------------------------------------ r1730 | zimmerma | 2012-02-12 17:23:22 +0100 (Sun, 12 Feb 2012) | 5 lines Changed paths: M /trunk/bench_mulredc.c M /trunk/configure.in M /trunk/ecm-params.h.core2 M /trunk/mpmod.c [configure.in] bump minimal GMP version to 5.0.0 [bench_mulredc.c,mpmod.c] since mpn_sqr is defined in GMP >= 5, no need to test it [ecm-params.h.core2] updated ------------------------------------------------------------------------ r1729 | zimmerma | 2012-02-12 16:12:25 +0100 (Sun, 12 Feb 2012) | 2 lines Changed paths: M /trunk/build.vc10/assembler/Makefile.am [assembler/Makefile.am] added missing mulredc.h ------------------------------------------------------------------------ 
r1726 | zimmerma | 2012-02-12 12:03:54 +0100 (Sun, 12 Feb 2012) | 2 lines Changed paths: M /trunk/Makefile.am M /trunk/bench_mulredc.c M /trunk/mpmod.c A /trunk/mpmod.h define macros for different choices of redc modular arithmetic ------------------------------------------------------------------------ r1725 | zimmerma | 2012-02-12 08:35:20 +0100 (Sun, 12 Feb 2012) | 2 lines Changed paths: M /trunk/build.vc10/Makefile.am [build.vc10/Makefile.am] removed non-existent file ecm-params.h.x64.core2 ------------------------------------------------------------------------ r1724 | brian_gladman | 2012-02-12 00:07:15 +0100 (Sun, 12 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/Makefile.am add distributed in build.vc10 to files to makefile.am ------------------------------------------------------------------------ r1723 | zimmerma | 2012-02-11 14:37:39 +0100 (Sat, 11 Feb 2012) | 15 lines Changed paths: M /trunk/README M /trunk/bench_mulredc.c M /trunk/ecm-impl.h M /trunk/ecm-params.h.alpha-ev56 M /trunk/ecm-params.h.armv5tel M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.core2 M /trunk/ecm-params.h.corei5 M /trunk/ecm-params.h.hppa2.0 M /trunk/ecm-params.h.ia64 M /trunk/ecm-params.h.mips64el M /trunk/ecm-params.h.pentium-m M /trunk/ecm-params.h.pentium4 M /trunk/ecm-params.h.powerpc970 M /trunk/ecm-params.h.sparc64 M /trunk/main.c M /trunk/mpmod.c M /trunk/test.pm1 M /trunk/tune.c Complete rewrite of the tuning mechanism for mulredc and sqrredc: instead of having a simple threshold, we have a full table for each one up to 20 limbs, where each entry for n limbs is an integer saying which function(s) should be used for the modular multiplication or squaring. Those tables are computed by bench_mulredc. On processors where assembly redc is available, I suggest we always have --enable-asm-redc, since the best routine will be chosen by bench_mulredc. 
On processors where assembly redc is not available, I suggest we still can use/compile bench_mulredc to choose the best routines. Ultimately bench_mulredc should be incorporated into "make tune". ------------------------------------------------------------------------ r1722 | brian_gladman | 2012-02-11 12:09:52 +0100 (Sat, 11 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/ecm/Makefile.am M /trunk/build.vc10/libecm/Makefile.am M /trunk/build.vc10/tune/Makefile.am Add IDE filters to the GMP-ECM distribution ------------------------------------------------------------------------ r1721 | zimmerma | 2012-02-11 09:45:15 +0100 (Sat, 11 Feb 2012) | 2 lines Changed paths: M /trunk/build.vc10/Makefile.am A /trunk/build.vc10/tune/Makefile.am M /trunk/configure.in another try to fix missing vc10 files ------------------------------------------------------------------------ r1720 | brian_gladman | 2012-02-10 21:40:28 +0100 (Fri, 10 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/tests.py another Windows build correction ------------------------------------------------------------------------ r1719 | zimmerma | 2012-02-10 21:38:19 +0100 (Fri, 10 Feb 2012) | 3 lines Changed paths: M /trunk/bench_mulredc.c M /trunk/x86_64/Makefile.am M /trunk/x86_64/mulredc.h [x86_64] enable back redc3 code (can be useful for modular squaring) [bench_mulredc.c] print results of tuning at the end ------------------------------------------------------------------------ r1718 | zimmerma | 2012-02-10 20:52:57 +0100 (Fri, 10 Feb 2012) | 2 lines Changed paths: M /trunk/Makefile.am A /trunk/build.vc10/Makefile.am A /trunk/build.vc10/assembler/Makefile.am A /trunk/build.vc10/ecm/Makefile.am A /trunk/build.vc10/libecm/Makefile.am M /trunk/configure.in put in "make dist" missing build.vc10 files (to be checked) ------------------------------------------------------------------------ r1717 | zimmerma | 2012-02-10 20:12:15 +0100 (Fri, 10 Feb 2012) | 3 lines 
Changed paths: M /trunk/ecm.c [ecm.c] for the batch mode and A=4d-2 with d "random", the torsion smoothness multiplier is 1/(3*3^(1/128)) = 0.330... and not 1/3 ! ------------------------------------------------------------------------ r1716 | brian_gladman | 2012-02-10 13:32:53 +0100 (Fri, 10 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/libecm/libecm.vcxproj minor windows build change ------------------------------------------------------------------------ r1715 | brian_gladman | 2012-02-10 10:49:38 +0100 (Fri, 10 Feb 2012) | 1 line Changed paths: M /trunk/build.vc10/ecm-params.h M /trunk/build.vc10/ecm-params.h.win32.amd M /trunk/build.vc10/ecm-params.h.win32.intel M /trunk/build.vc10/ecm-params.h.x64.amd M /trunk/build.vc10/ecm-params.h.x64.intel M /trunk/build.vc10/tune/tune.vcxproj M /trunk/build.vc10/tune/tune.vcxproj.filters correct windows tuning errors ------------------------------------------------------------------------ r1714 | zimmerma | 2012-02-09 12:42:54 +0100 (Thu, 09 Feb 2012) | 2 lines Changed paths: M /trunk/bench_mulredc.c [bench_mulredc.c] updated to measure more low-level functions ------------------------------------------------------------------------ r1713 | zimmerma | 2012-02-08 12:29:36 +0100 (Wed, 08 Feb 2012) | 3 lines Changed paths: M /trunk/ecm.c [ecm.c] corrected the value of BATCH_EXTRA_SMOOTHNESS for GMP_NUMB_BITS >= 64, where we use A=4d-2 with d a square ------------------------------------------------------------------------ r1712 | zimmerma | 2012-02-07 22:24:56 +0100 (Tue, 07 Feb 2012) | 4 lines Changed paths: M /trunk/mpmod.c [mpmod.c] new function sqrredc in C (not used yet because slower than the assembly mulredc) use redc_basecase_n in mpresn_sqr and mpresn_mul: small speedup ------------------------------------------------------------------------ r1711 | zimmerma | 2012-02-06 17:15:47 +0100 (Mon, 06 Feb 2012) | 3 lines Changed paths: M /trunk/mpmod.c [mpmod.c] use TUNE_SQRREDC_THRESH and TUNE_MULREDC_THRESH in 
the mpresn_* functions ------------------------------------------------------------------------ r1710 | zimmerma | 2012-02-06 16:10:32 +0100 (Mon, 06 Feb 2012) | 3 lines Changed paths: M /trunk/README.dev M /trunk/ecmbench [ecmbench] use $1 to allow testing different versions [README.dev] added item for efficiency non-regression ------------------------------------------------------------------------ r1708 | zimmerma | 2012-02-06 15:58:01 +0100 (Mon, 06 Feb 2012) | 2 lines Changed paths: M /trunk/ecm.c M /trunk/ecm2.c M /trunk/lucas.c M /trunk/mpmod.c M /trunk/pm1fs2.c M /trunk/pp1.c M /trunk/tune.c replaced all occurrences of mpres_mul (a, x, x, m) by mpres_sqr (a, x, m) ------------------------------------------------------------------------ r1706 | zimmerma | 2012-02-06 14:34:35 +0100 (Mon, 06 Feb 2012) | 7 lines Changed paths: M /trunk/NEWS M /trunk/README.dev M /trunk/configure.in M /trunk/ecm-impl.h A /trunk/ecm-params.h.corei5 A /trunk/ecmbench M /trunk/mpmod.c M /trunk/tune.c [tune.c] cleaned up computation of TUNE_MULREDC_THRESH and TUNE_SQRREDC_THRESH [ecm-params.h.corei5] new parameters for Core i5 [configure.in] now recognize Core i5 [ecmbench] new bench utility [ecm-impl.h] new macro MULREDC_ASSEMBLY_MAX [mpmod.c] use MULREDC_ASSEMBLY_MAX ------------------------------------------------------------------------ r1703 | zimmerma | 2012-02-04 09:47:09 +0100 (Sat, 04 Feb 2012) | 2 lines Changed paths: M /trunk/build.vc10/config.h M /trunk/configure.in switch version to 6.5-dev, and assertions on by default ------------------------------------------------------------------------ r1701 | zimmerma | 2012-02-03 20:59:21 +0100 (Fri, 03 Feb 2012) | 2 lines Changed paths: M /trunk/batch.c M /trunk/mpmod.c make new batch-mode code also work with --disable-asm-redc ------------------------------------------------------------------------ r1700 | zimmerma | 2012-02-03 17:14:22 +0100 (Fri, 03 Feb 2012) | 6 lines Changed paths: M /trunk/batch.c M /trunk/ecm-impl.h M 
/trunk/ecm.c M /trunk/mpmod.c M /trunk/pm1fs2.c [pm1fs2.c] renamed mpmod_copy into mpmod_init_set [batch.c] use new function mpresn_pad [ecm-impl.h] added mult_modulus (currently initialized to N) [mpmod.c] simplified the mpresn_* functions (assuming repr=ECM_MOD_MODMULN) [ecm.c] in batch mode, force repr=ECM_MOD_MODMULN ------------------------------------------------------------------------ r1697 | zimmerma | 2012-02-03 10:07:59 +0100 (Fri, 03 Feb 2012) | 4 lines Changed paths: M /trunk/batch.c M /trunk/ecm-impl.h M /trunk/mpmod.c [ecm-impl.h] added missing prototypes [batch.c] removed debug printf() statements [mpmod.c] fixed compiler warnings ------------------------------------------------------------------------ r1696 | zimmerma | 2012-02-02 20:08:19 +0100 (Thu, 02 Feb 2012) | 2 lines Changed paths: M /trunk/NEWS [NEWS] added --enable-mulredc-svodoba ------------------------------------------------------------------------ r1695 | bouvierc | 2012-02-02 15:57:09 +0100 (Thu, 02 Feb 2012) | 4 lines Changed paths: M /trunk/batch.c M /trunk/mpmod.c Replace mpz operations by mpn operations in dup_add in batch.c For now only mul, sqr, add and sub have been modified. Reduction have not changed. ------------------------------------------------------------------------ r1694 | kruppa | 2012-02-02 15:52:19 +0100 (Thu, 02 Feb 2012) | 2 lines Changed paths: M /trunk/configure.in M /trunk/x86_64/mulredc.m4 Support for Svoboda mulredc in x86_64/. 
Matching support in mpmod.c TBD ------------------------------------------------------------------------ r1693 | zimmerma | 2012-01-30 17:58:24 +0100 (Mon, 30 Jan 2012) | 2 lines Changed paths: M /trunk/getprime.c [getprime.c] fixed typo in commented printf() ------------------------------------------------------------------------ r1690 | kruppa | 2012-01-16 16:00:54 +0100 (Mon, 16 Jan 2012) | 2 lines Changed paths: M /trunk/mpzspv.c Slight cleanup ------------------------------------------------------------------------ r1684 | zimmerma | 2012-01-09 14:12:38 +0100 (Mon, 09 Jan 2012) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] perform lazy reduction in REDC when N < B^n/4 ------------------------------------------------------------------------ r1678 | zimmerma | 2012-01-07 16:45:28 +0100 (Sat, 07 Jan 2012) | 2 lines Changed paths: M /trunk/INSTALL-ecm M /trunk/Makefile.am M /trunk/configure.in removed leftover references to build.vc9 ------------------------------------------------------------------------ r1677 | zimmerma | 2012-01-06 18:32:22 +0100 (Fri, 06 Jan 2012) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm_cc13/Makefile cleaned up Makefiles ------------------------------------------------------------------------ r1676 | brian_gladman | 2012-01-04 18:14:22 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk D /trunk/build.vc9 Remove some errors from the Visual Studio readme.txt filesremove unmaintained Visual Studio 2008 build files ------------------------------------------------------------------------ r1675 | zimmerma | 2012-01-04 09:53:53 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/main.c [main.c] fixed warning on 32-bit processor ------------------------------------------------------------------------ r1674 | zimmerma | 2012-01-04 00:44:48 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/README.dev [README.dev] added tag for 6.4 
------------------------------------------------------------------------ r1672 | zimmerma | 2012-01-04 00:42:50 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/ChangeLog [ChangeLog] updated, this is the GMP-ECM 6.4 release ------------------------------------------------------------------------ r1671 | brian_gladman | 2012-01-04 00:39:25 +0100 (Wed, 04 Jan 2012) | 1 line Changed paths: M /trunk/build.vc10/readme.txt M /trunk/build.vc9/readme.txt Remove some errors from the Visual Studio readme.txt files ------------------------------------------------------------------------ r1670 | zimmerma | 2012-01-04 00:38:05 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: A /trunk/INSTALL-ecm (from /trunk/INSTALL:1662) [INSTALL-ecm] specific INSTALL file for GMP-ECM ------------------------------------------------------------------------ r1669 | zimmerma | 2012-01-04 00:34:36 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/mpmod.c M /trunk/resume.c removed compiler warnings found with gcc 4.6.1 ------------------------------------------------------------------------ r1668 | zimmerma | 2012-01-04 00:26:26 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/configure.in [configure.in] added code to check for Pentium 4 ------------------------------------------------------------------------ r1667 | zimmerma | 2012-01-04 00:24:49 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: D /trunk/INSTALL [INSTALL] removed from svn, since it is now autogenerated by the autotools ------------------------------------------------------------------------ r1666 | zimmerma | 2012-01-04 00:24:01 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/ChangeLog [ChangeLog] updated for 6.4 ------------------------------------------------------------------------ r1665 | zimmerma | 2012-01-04 00:21:36 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/TODO [TODO] removed several done items 
------------------------------------------------------------------------ r1664 | zimmerma | 2012-01-04 00:21:13 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/README [README] added sample example ------------------------------------------------------------------------ r1663 | zimmerma | 2012-01-04 00:20:54 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] added INSTALL-ecm (INSTALL is now the generic GNU package file) ------------------------------------------------------------------------ r1662 | zimmerma | 2012-01-04 00:13:33 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/NEWS [NEWS] added one item ------------------------------------------------------------------------ r1661 | zimmerma | 2012-01-04 00:13:06 +0100 (Wed, 04 Jan 2012) | 3 lines Changed paths: M /trunk/ecm-gmp.h M /trunk/mpmod.c M /trunk/schoen_strass.c use of mpn_mullo_n when available check if _mpz_realloc sets value to 0 ------------------------------------------------------------------------ r1660 | zimmerma | 2012-01-04 00:12:03 +0100 (Wed, 04 Jan 2012) | 3 lines Changed paths: M /trunk/build.vc10/config.h M /trunk/build.vc9/config.h M /trunk/configure.in changed version to 6.4 added check for mpn_mullo_n ------------------------------------------------------------------------ r1659 | zimmerma | 2012-01-04 00:03:08 +0100 (Wed, 04 Jan 2012) | 2 lines Changed paths: M /trunk/build.vc10/readme.txt M /trunk/build.vc9/readme.txt fixed typos ------------------------------------------------------------------------ r1658 | zimmerma | 2012-01-03 23:49:23 +0100 (Tue, 03 Jan 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.pentium4 [ecm-params.h.pentium4] updated for ecm-6.4 ------------------------------------------------------------------------ r1657 | zimmerma | 2012-01-03 23:31:19 +0100 (Tue, 03 Jan 2012) | 3 lines Changed paths: M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.core2 [ecm-params.h.athlon64] updated on a true AMD 
machine [ecm-params.h.core2] added model name ------------------------------------------------------------------------ r1656 | zimmerma | 2012-01-03 23:25:27 +0100 (Tue, 03 Jan 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.core2 [ecm-params.h.core2] updated for ecm-6.4 ------------------------------------------------------------------------ r1655 | zimmerma | 2012-01-03 23:20:11 +0100 (Tue, 03 Jan 2012) | 2 lines Changed paths: M /trunk/ecm-params.h.athlon64 [ecm-params.h.athlon64] updated for ecm 6.4 ------------------------------------------------------------------------ r1653 | kruppa | 2012-01-03 04:28:57 +0100 (Tue, 03 Jan 2012) | 2 lines Changed paths: M /trunk/mpmod.c nn must be initialised before testing assertions involving nn ------------------------------------------------------------------------ r1647 | zimmerma | 2011-12-30 10:27:49 +0100 (Fri, 30 Dec 2011) | 2 lines Changed paths: M /trunk/INSTALL [INSTALL] modified instructions for GWNUM ------------------------------------------------------------------------ r1644 | zimmerma | 2011-12-27 16:28:42 +0100 (Tue, 27 Dec 2011) | 2 lines Changed paths: M /trunk/AUTHORS M /trunk/README.dev added Cyril Bouvier as author, small fixes in README.dev ------------------------------------------------------------------------ r1643 | zimmerma | 2011-12-27 16:22:31 +0100 (Tue, 27 Dec 2011) | 2 lines Changed paths: M /trunk/ChangeLog M /trunk/INSTALL M /trunk/NEWS M /trunk/README.dev M /trunk/build.vc10/config.h M /trunk/build.vc9/config.h M /trunk/configure.in prepare for the 6.4 release: now 6.4-rc1 ------------------------------------------------------------------------ r1642 | zimmerma | 2011-12-27 15:39:10 +0100 (Tue, 27 Dec 2011) | 3 lines Changed paths: M /trunk/main.c [main.c] on 64-bit processors, take d a square in batch mode, which gives a larger average torsion, and thus a larger success probability ------------------------------------------------------------------------ r1641 | zimmerma | 
2011-12-27 12:06:43 +0100 (Tue, 27 Dec 2011) | 13 lines Changed paths: M /trunk/main.c M /trunk/pm1.c work on "bug" reported by Jason Papadopoulos: the choice between NTT and no-NTT code for P-1 was wrong. In fact, since revision 1558, we prefer the variant with the larger transform length, which is sometimes a bad choice. For example with the following resume file the new choice is twice as slow: METHOD=P-1; B1=10000000; N=29799904256775982671863388319999573561548825027149399972531599612392671227006866151136667908641695103422986028076864929902803267437351318167549013218980573566942647077444419419003164546362008247462049; X=0x58bcade9a21209a49e884562ffac2b1dc3041ba75aacb160628223b64bc056cd2212ea489c9dfebe3336df2359ad41cb5ddfa54f7e1ed908cf5b47feed64d7b7daf309751bd9d5aa5848079de14d5590d13be1; CHECKSUM=2706566245; PROGRAM=GMP-ECM 6.3; X0=0xd13920f9; WHO=jasonp@COMPUTER; TIME=Mon Dec 12 07:50:07 2011; Moreover even with -ntt we could not force the NTT code to be used. This patch only fixes that problem: we can now force the NTT code with -ntt for PM1. Apart from the efficiency problem above (still to be solved), there is another problem: in case both NTT and no-NTT are tried, and finally NTT is preferred, the computed B2 value is wrong (it is that of no-NTT). 
------------------------------------------------------------------------ r1640 | brian_gladman | 2011-12-20 13:22:32 +0100 (Tue, 20 Dec 2011) | 1 line Changed paths: M /trunk/build.vc10/readme.txt update Visual Studio readme.txt to reflect output directory change ------------------------------------------------------------------------ r1639 | brian_gladman | 2011-12-20 09:03:09 +0100 (Tue, 20 Dec 2011) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/mp_lib.props M /trunk/build.vc10/tune/tune.vcxproj move Visual Studio build output directories up one level in the directory tree ------------------------------------------------------------------------ r1638 | kruppa | 2011-12-14 16:07:17 +0100 (Wed, 14 Dec 2011) | 7 lines Changed paths: M /trunk/ecm_ntt.c M /trunk/mpzspv.c M /trunk/pm1fs2.c M /trunk/sp.h Moved some NTT related functions from pm1fs2.c to mpzspv.c Added NTT mul function that does forward transforms, point-wise multiply, and inverse transforms one small-prime vector at a time, which slightly improves memory access locality (also perhaps in preparation for later disk-stored vectors) ------------------------------------------------------------------------ r1637 | brian_gladman | 2011-11-21 22:43:09 +0100 (Mon, 21 Nov 2011) | 1 line Changed paths: M /trunk/build.vc10/tune/tune.vcxproj Add _WIN64 compiler define to the Windows x64 tune build ------------------------------------------------------------------------ r1636 | bouvierc | 2011-11-09 21:59:11 +0100 (Wed, 09 Nov 2011) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/cudaarith.cu M /trunk/gpu/gpu_ecm/main.cu M /trunk/gpu/gpu_ecm/main.h M /trunk/gpu/gpu_ecm/utils.cu Translate comments from french to english in gpu_ecm's code ------------------------------------------------------------------------ r1635 | zimmerma | 2011-11-08 21:15:24 +0100 (Tue, 08 Nov 2011) | 2 lines Changed paths: M /trunk/batch.c M /trunk/mpmod.c removed 
compiler warnings ------------------------------------------------------------------------ r1634 | zimmerma | 2011-11-08 21:11:47 +0100 (Tue, 08 Nov 2011) | 4 lines Changed paths: M /trunk/athlon/mulredc.h M /trunk/batch.c M /trunk/configure.in M /trunk/ecm-impl.h M /trunk/main.c M /trunk/mpmod.c M /trunk/pentium4/mulredc.h M /trunk/powerpc64/mulredc.h M /trunk/x86_64/mulredc.h changed NATIVE_REDC into USE_ASM_REDC to be coherent with --enable-asm-redc added new mpres_sqr function, to make it easier if/when we implement a faster modular squaring function ------------------------------------------------------------------------ r1633 | bouvierc | 2011-11-04 17:37:31 +0100 (Fri, 04 Nov 2011) | 3 lines Changed paths: A /trunk/gpu/gpu_ecm/check.sh To check the correctness of gpu_ecm use : ./check B1 ------------------------------------------------------------------------ r1632 | bouvierc | 2011-11-04 17:35:53 +0100 (Fri, 04 Nov 2011) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/cudaarith.cu M /trunk/gpu/gpu_ecm/cudautils.cu M /trunk/gpu/gpu_ecm/cudautils.h D /trunk/gpu/gpu_ecm/gpu_ecm M /trunk/gpu/gpu_ecm/main.cu M /trunk/gpu/gpu_ecm/main.h M /trunk/gpu/gpu_ecm/utils.cu M /trunk/gpu/gpu_ecm/utils.h New version of gpu_ecm (compatible with Fermi cards) ------------------------------------------------------------------------ r1631 | dcleaver | 2011-10-24 04:30:28 +0200 (Mon, 24 Oct 2011) | 1 line Changed paths: M /trunk/batch.c use original compute_s with increased MAX_HEIGHT ------------------------------------------------------------------------ r1630 | zimmerma | 2011-10-23 13:16:09 +0200 (Sun, 23 Oct 2011) | 2 lines Changed paths: M /trunk/batch.c M /trunk/main.c [batch.c, main.c] removed useless code + gnu coding style ------------------------------------------------------------------------ r1629 | dcleaver | 2011-10-23 06:01:57 +0200 (Sun, 23 Oct 2011) | 1 line Changed paths: M /trunk/batch.c M /trunk/ecm-ecm.h M /trunk/factor.c M 
/trunk/main.c clear s in proper place, show correct time to calculate s, make output defines the same between ecm-ecm.h and ecm-impl.h ------------------------------------------------------------------------ r1628 | dcleaver | 2011-10-23 05:13:25 +0200 (Sun, 23 Oct 2011) | 1 line Changed paths: M /trunk/ecm-ecm.h corrected function prototype ------------------------------------------------------------------------ r1627 | dcleaver | 2011-10-23 05:08:25 +0200 (Sun, 23 Oct 2011) | 1 line Changed paths: M /trunk/batch.c M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h M /trunk/factor.c M /trunk/main.c update compute_s to allow larger B1, and compute s once per B1 ------------------------------------------------------------------------ r1626 | zimmerma | 2011-10-18 14:06:23 +0200 (Tue, 18 Oct 2011) | 2 lines Changed paths: M /trunk/ecm.c [ecm.c] added #define for batch mode smoothness constant ------------------------------------------------------------------------ r1625 | brian_gladman | 2011-10-15 20:03:36 +0200 (Sat, 15 Oct 2011) | 1 line Changed paths: M /trunk ------------------------------------------------------------------------ r1624 | zimmerma | 2011-10-13 11:36:46 +0200 (Thu, 13 Oct 2011) | 4 lines Changed paths: M /trunk/ecm.c M /trunk/rho.c [rho.c] added comments about EXTRA_SMOOTHNESS factor (Alex please complete) [ecm.c] take into account experimental factor of 3 less in torsion for batch mode (for expected number of curves and time) ------------------------------------------------------------------------ r1623 | brian_gladman | 2011-10-10 18:18:13 +0200 (Mon, 10 Oct 2011) | 1 line Changed paths: M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj.filters add batch.c to VC++ build ------------------------------------------------------------------------ r1622 | dcleaver | 2011-10-09 19:58:34 +0200 (Sun, 09 Oct 2011) | 1 line Changed paths: M /trunk/batch.c Test: removed some trailing white space 
------------------------------------------------------------------------ r1621 | zimmerma | 2011-10-02 22:27:01 +0200 (Sun, 02 Oct 2011) | 2 lines Changed paths: M /trunk/batch.c [batch.c] fix on 32-bit machines when s has >= 2^31 bits ------------------------------------------------------------------------ r1620 | zimmerma | 2011-10-02 09:10:05 +0200 (Sun, 02 Oct 2011) | 2 lines Changed paths: M /trunk/Fgw.c [Fgw.c] added math.h header ------------------------------------------------------------------------ r1619 | zimmerma | 2011-10-01 17:12:52 +0200 (Sat, 01 Oct 2011) | 3 lines Changed paths: M /trunk/Fgw.c M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h M /trunk/factor.c M /trunk/main.c included code from David Cleaver to recognize inputs of the form k*b^n+c (only with GWNUM) ------------------------------------------------------------------------ r1618 | bouvierc | 2011-09-29 14:53:25 +0200 (Thu, 29 Sep 2011) | 2 lines Changed paths: M /trunk/ecm-ecm.h Modify MAX_B1 in order to avoid error during char to double conversion ------------------------------------------------------------------------ r1617 | zimmerma | 2011-09-29 14:47:29 +0200 (Thu, 29 Sep 2011) | 3 lines Changed paths: M /trunk/batch.c [batch.c] MAX_B1 -> MAX_B1_BATCH to avoid conflict with MAX_B1 defined in ecm-ecm.h ------------------------------------------------------------------------ r1616 | zimmerma | 2011-09-29 14:39:55 +0200 (Thu, 29 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c [batch.c] added comments and rewritten compute_s() ------------------------------------------------------------------------ r1615 | bouvierc | 2011-09-29 10:28:42 +0200 (Thu, 29 Sep 2011) | 3 lines Changed paths: M /trunk/batch.c New compute_s function with only one array to accumulate ------------------------------------------------------------------------ r1614 | zimmerma | 2011-09-29 08:16:12 +0200 (Thu, 29 Sep 2011) | 3 lines Changed paths: M /trunk/Fgw.c M /trunk/batch.c [Fgw.c] 
fixed error in ASSERT_ALWAYS [batch.c] added missing mpz_clear for s ------------------------------------------------------------------------ r1613 | zimmerma | 2011-09-28 16:46:16 +0200 (Wed, 28 Sep 2011) | 2 lines Changed paths: M /trunk/Fgw.c [Fgw.c] added constraints on gw_c, and fixed stupid error ------------------------------------------------------------------------ r1612 | zimmerma | 2011-09-28 16:43:52 +0200 (Wed, 28 Sep 2011) | 2 lines Changed paths: M /trunk/auxlib.c M /trunk/mpmod.c M /trunk/mul_fft.c removed dead code (reported by gcc 4.6.1) ------------------------------------------------------------------------ r1611 | zimmerma | 2011-09-28 14:24:29 +0200 (Wed, 28 Sep 2011) | 2 lines Changed paths: M /trunk/Fgw.c [Fgw.c] added assertions on gw_k ------------------------------------------------------------------------ r1610 | zimmerma | 2011-09-26 08:53:04 +0200 (Mon, 26 Sep 2011) | 2 lines Changed paths: M /trunk/factor.c [factor.c] set batch=0 in ecm_init ------------------------------------------------------------------------ r1609 | zimmerma | 2011-09-25 21:48:04 +0200 (Sun, 25 Sep 2011) | 2 lines Changed paths: M /trunk/Fgw.c M /trunk/ecm.c more changes from David Cleaver for GWNUM and batch mode ------------------------------------------------------------------------ r1608 | zimmerma | 2011-09-23 16:43:33 +0200 (Fri, 23 Sep 2011) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] added one test for -batch that did fail on 32-bit computers ------------------------------------------------------------------------ r1607 | zimmerma | 2011-09-23 16:28:24 +0200 (Fri, 23 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ecm-impl.h M /trunk/mpmod.c [batch.c] tiny speedup using new mpres_mul_2exp function ------------------------------------------------------------------------ r1606 | bouvierc | 2011-09-23 14:57:00 +0200 (Fri, 23 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c Correct the bug reported by David Cleaver on 32-bit system 
------------------------------------------------------------------------ r1605 | zimmerma | 2011-09-23 08:45:28 +0200 (Fri, 23 Sep 2011) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] added one test for the batch mode ------------------------------------------------------------------------ r1604 | zimmerma | 2011-09-22 12:06:04 +0200 (Thu, 22 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c M /trunk/test.ecm [batch.c] check that A = 2 (mod 4) for the batch mode ------------------------------------------------------------------------ r1603 | zimmerma | 2011-09-21 07:50:50 +0200 (Wed, 21 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c [batch.c] better fix ------------------------------------------------------------------------ r1602 | zimmerma | 2011-09-20 17:41:13 +0200 (Tue, 20 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c [batch.c] small changes ------------------------------------------------------------------------ r1601 | zimmerma | 2011-09-20 17:29:40 +0200 (Tue, 20 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/resume.c fixed a few compiler warnings with batch mode ------------------------------------------------------------------------ r1600 | zimmerma | 2011-09-20 08:49:15 +0200 (Tue, 20 Sep 2011) | 2 lines Changed paths: M /trunk/main.c [main.c] fixed stupid error in batch mode (thanks David Cleaver) ------------------------------------------------------------------------ r1599 | brian_gladman | 2011-09-19 23:26:40 +0200 (Mon, 19 Sep 2011) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/ecm/ecm.vcxproj.filters add batch mode file to Visual Studio 2010 builds ------------------------------------------------------------------------ r1598 | zimmerma | 2011-09-19 20:45:04 +0200 (Mon, 19 Sep 2011) | 2 lines Changed paths: M /trunk/batch.c M /trunk/ecm.c M /trunk/main.c cleanup of the batch mode 
------------------------------------------------------------------------ r1597 | bouvierc | 2011-09-16 15:54:32 +0200 (Fri, 16 Sep 2011) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/TODO A /trunk/batch.c M /trunk/configure.in M /trunk/ecm-ecm.h M /trunk/ecm-gmp.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h M /trunk/factor.c M /trunk/main.c M /trunk/mpmod.c M /trunk/test.ecm Implementation of the batch mode (option -batch) Use Montgomery's parametrization to save 1 multiplication ------------------------------------------------------------------------ r1596 | zimmerma | 2011-09-15 18:41:42 +0200 (Thu, 15 Sep 2011) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/pp1.c got rid of the obsolete mpres_normalize and mpres_semi_normalize functions ------------------------------------------------------------------------ r1595 | zimmerma | 2011-09-03 09:33:12 +0200 (Sat, 03 Sep 2011) | 2 lines Changed paths: M /trunk/Fgw.c [Fgw.c] more changes from David Cleaver, for future new version of GWNUM ------------------------------------------------------------------------ r1594 | zimmerma | 2011-09-02 20:19:45 +0200 (Fri, 02 Sep 2011) | 2 lines Changed paths: M /trunk/ecm-params.h.athlon64 [ecm-params.h.athlon64] updated (from David Cleaver) ------------------------------------------------------------------------ r1593 | zimmerma | 2011-09-02 16:47:44 +0200 (Fri, 02 Sep 2011) | 2 lines Changed paths: M /trunk/main.c [main.c] deal with MPIR in print_config() too ------------------------------------------------------------------------ r1592 | zimmerma | 2011-09-02 16:37:02 +0200 (Fri, 02 Sep 2011) | 2 lines Changed paths: M /trunk/README.dev [README.dev] added item for a new release ------------------------------------------------------------------------ r1591 | zimmerma | 2011-09-01 19:03:25 +0200 (Thu, 01 Sep 2011) | 2 lines Changed paths: M /trunk/Fgw.c M /trunk/INSTALL fixed typo in name of David Cleaver (sorry) 
------------------------------------------------------------------------ r1590 | zimmerma | 2011-08-22 13:58:47 +0200 (Mon, 22 Aug 2011) | 3 lines Changed paths: M /trunk/Fgw.c M /trunk/INSTALL M /trunk/Makefile.am added changes contributed by David Cleaver to use gwnum 26.6 on Windows x64 with MingW64 in Msys ------------------------------------------------------------------------ r1589 | kruppa | 2011-07-25 16:11:38 +0200 (Mon, 25 Jul 2011) | 2 lines Changed paths: M /trunk/configure.in Patch to make configure.in work with autoconf 2.68, provided by Ralf Recker ------------------------------------------------------------------------ r1588 | zimmerma | 2011-06-30 13:37:01 +0200 (Thu, 30 Jun 2011) | 2 lines Changed paths: M /trunk/champions.h [champions.h] bumped P-1 value ------------------------------------------------------------------------ r1587 | bouvierc | 2011-06-24 14:02:32 +0200 (Fri, 24 Jun 2011) | 3 lines Changed paths: M /trunk/gpu/gpu_ecm/Makefile M /trunk/gpu/gpu_ecm/cudaarith.cu M /trunk/gpu/gpu_ecm/cudautils.cu M /trunk/gpu/gpu_ecm/cudautils.h M /trunk/gpu/gpu_ecm/main.cu M /trunk/gpu/gpu_ecm/main.h M /trunk/gpu/gpu_ecm/test.sh M /trunk/gpu/gpu_ecm/utils.cu M /trunk/gpu/gpu_ecm/utils.h Latest improvements. ------------------------------------------------------------------------ r1586 | zimmerma | 2011-06-20 16:34:33 +0200 (Mon, 20 Jun 2011) | 3 lines Changed paths: M /trunk/main.c [main.c] fixed error message allow 1 as factor found (when input number is 1) ------------------------------------------------------------------------ r1585 | zimmerma | 2011-06-20 16:19:19 +0200 (Mon, 20 Jun 2011) | 2 lines Changed paths: M /trunk/main.c [main.c] GNU coding style ------------------------------------------------------------------------ r1584 | bouvierc | 2011-06-20 16:12:14 +0200 (Mon, 20 Jun 2011) | 3 lines Changed paths: M /trunk/main.c Add a test to forbid nonpositive integers. 
------------------------------------------------------------------------ r1583 | bouvierc | 2011-06-20 15:11:36 +0200 (Mon, 20 Jun 2011) | 3 lines Changed paths: M /trunk/auxi.c Fix a bug for number of digits with negative number. ------------------------------------------------------------------------ r1582 | zimmerma | 2011-06-15 07:43:23 +0200 (Wed, 15 Jun 2011) | 2 lines Changed paths: M /trunk/champions.h [champions.h] minimal size for ECM is now 67 digits ------------------------------------------------------------------------ r1581 | bouvierc | 2011-06-01 15:04:35 +0200 (Wed, 01 Jun 2011) | 5 lines Changed paths: A /trunk/gpu/gpu_ecm A /trunk/gpu/gpu_ecm/Makefile A /trunk/gpu/gpu_ecm/cudaarith.cu A /trunk/gpu/gpu_ecm/cudaarith.h A /trunk/gpu/gpu_ecm/cudautils.cu A /trunk/gpu/gpu_ecm/cudautils.h A /trunk/gpu/gpu_ecm/gpu_ecm A /trunk/gpu/gpu_ecm/main.cu A /trunk/gpu/gpu_ecm/main.h A /trunk/gpu/gpu_ecm/obj A /trunk/gpu/gpu_ecm/test.sh A /trunk/gpu/gpu_ecm/utils.cu A /trunk/gpu/gpu_ecm/utils.h gpu_ecm for GPU of compute capability 1.3 and above Optimize for 1024bits modulus but can be compiled for 256 and 512bits modulus with make v=1 and make v=2 ------------------------------------------------------------------------ r1580 | brian_gladman | 2011-05-31 23:44:26 +0200 (Tue, 31 May 2011) | 1 line Changed paths: M /trunk/build.vc10/tests.py add overall timing to the Windows test program (tests.py) ------------------------------------------------------------------------ r1579 | brian_gladman | 2011-05-31 17:33:52 +0200 (Tue, 31 May 2011) | 1 line Changed paths: M /trunk/build.vc10/config.h M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/tests.py update Windows builds to add missing defines and add OpenMP support ------------------------------------------------------------------------ r1578 | zimmerma | 2011-05-18 12:11:45 +0200 (Wed, 18 May 2011) | 2 lines Changed paths: M /trunk/champions.h [champions.h] updated 
for P+1 ------------------------------------------------------------------------ r1577 | zimmerma | 2011-05-05 13:05:45 +0200 (Thu, 05 May 2011) | 2 lines Changed paths: M /trunk/champions.h [champions.h] updated P+1 size ------------------------------------------------------------------------ r1576 | zimmerma | 2011-05-04 13:19:18 +0200 (Wed, 04 May 2011) | 2 lines Changed paths: M /trunk/test.ecm [test.ecm] added some tests to exercise patch from David Cleaver (r1575) ------------------------------------------------------------------------ r1575 | zimmerma | 2011-05-04 12:45:37 +0200 (Wed, 04 May 2011) | 4 lines Changed paths: M /trunk/TODO M /trunk/configure.in M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/lucas.c M /trunk/pp1.c added patch from David Cleaver to allow B1>=2^32 on machines where "unsigned long" has 32 bits only, by using "unsigned long long" ------------------------------------------------------------------------ r1574 | bouvierc | 2011-04-21 16:30:48 +0200 (Thu, 21 Apr 2011) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm_cc13/cudaarith.cu Correct a little mistake in the compare function. ------------------------------------------------------------------------ r1573 | bouvierc | 2011-04-20 17:40:57 +0200 (Wed, 20 Apr 2011) | 2 lines Changed paths: M /trunk/gpu/gpu_ecm_cc13/Makefile M /trunk/gpu/gpu_ecm_cc13/cudaarith.cu M /trunk/gpu/gpu_ecm_cc13/cudaarith.h M /trunk/gpu/gpu_ecm_cc13/cudautils.cu M /trunk/gpu/gpu_ecm_cc13/main.cu M /trunk/gpu/gpu_ecm_cc13/test.sh M /trunk/gpu/gpu_ecm_cc13/utils.h Some improvements (especially on multiplication). 30% gain in time. ------------------------------------------------------------------------ r1572 | bouvierc | 2011-04-14 15:33:22 +0200 (Thu, 14 Apr 2011) | 1 line Changed paths: M /trunk/gpu/gpu_ecm_cc13/cudautils.cu M /trunk/gpu/gpu_ecm_cc13/main.cu Corrections of the copy between CPU and GPU which only worked for half of the curves. 
------------------------------------------------------------------------ r1571 | bouvierc | 2011-04-13 10:46:45 +0200 (Wed, 13 Apr 2011) | 1 line Changed paths: A /trunk/gpu/gpu_ecm_cc13 A /trunk/gpu/gpu_ecm_cc13/Makefile A /trunk/gpu/gpu_ecm_cc13/cudaarith.cu A /trunk/gpu/gpu_ecm_cc13/cudaarith.h A /trunk/gpu/gpu_ecm_cc13/cudautils.cu A /trunk/gpu/gpu_ecm_cc13/main.cu A /trunk/gpu/gpu_ecm_cc13/obj A /trunk/gpu/gpu_ecm_cc13/test.sh A /trunk/gpu/gpu_ecm_cc13/utils.h Implementation of ecm for NVIDIA GPU of compute capability 1.3. ./test.sh provides an example. ------------------------------------------------------------------------ r1570 | zimmerma | 2011-04-08 13:56:38 +0200 (Fri, 08 Apr 2011) | 2 lines Changed paths: M /trunk/TODO [TODO] added one item ------------------------------------------------------------------------ r1569 | zimmerma | 2011-04-08 08:31:37 +0200 (Fri, 08 Apr 2011) | 2 lines Changed paths: M /trunk/README [README] updated OpenPFGW url ------------------------------------------------------------------------ r1568 | zimmerma | 2011-03-30 13:05:10 +0200 (Wed, 30 Mar 2011) | 2 lines Changed paths: M /trunk/champions.h [champions.h] updated for P-1 ------------------------------------------------------------------------ r1567 | brian_gladman | 2011-03-30 11:01:16 +0200 (Wed, 30 Mar 2011) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj Adjust stack size options for Windows ------------------------------------------------------------------------ r1566 | zimmerma | 2011-03-10 11:50:33 +0100 (Thu, 10 Mar 2011) | 2 lines Changed paths: M /trunk/champions.h [champions.h] updated P+1 minimum digits ------------------------------------------------------------------------ r1565 | zimmerma | 2011-01-18 17:44:09 +0100 (Tue, 18 Jan 2011) | 2 lines Changed paths: M /trunk/main.c [main.c] removed trailing blank (commit test) ------------------------------------------------------------------------ r1564 | brian_gladman | 2011-01-14 17:46:08 +0100 (Fri, 
14 Jan 2011) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/ecm/ecm.vcxproj.filters M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj.filters Add champions.h to Windows builds ------------------------------------------------------------------------ r1563 | zimmerma | 2011-01-12 14:55:15 +0100 (Wed, 12 Jan 2011) | 3 lines Changed paths: M /trunk/main.c [main.c] print assertions too (if enabled) in the startup line, and cleaned up the corresponding code ------------------------------------------------------------------------ r1562 | kruppa | 2011-01-06 16:48:16 +0100 (Thu, 06 Jan 2011) | 2 lines Changed paths: M /trunk/x86_64/mulredc.m4 Better asm code for AMD cpus ------------------------------------------------------------------------ r1561 | kruppa | 2011-01-06 13:49:16 +0100 (Thu, 06 Jan 2011) | 2 lines Changed paths: M /trunk/Makefile.am A /trunk/champions.h M /trunk/main.c Put champions list in own file to avoid frequent updates to main.c ------------------------------------------------------------------------ r1560 | brian_gladman | 2010-12-30 16:19:42 +0100 (Thu, 30 Dec 2010) | 1 line Changed paths: M /trunk/build.vc10/config.h minor change to Windows win32 build configuration ------------------------------------------------------------------------ r1559 | kruppa | 2010-12-17 17:18:11 +0100 (Fri, 17 Dec 2010) | 2 lines Changed paths: M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Fix spurious error messages introduced in last commit ------------------------------------------------------------------------ r1558 | kruppa | 2010-12-17 17:11:47 +0100 (Fri, 17 Dec 2010) | 4 lines Changed paths: M /trunk/pm1.c M /trunk/pm1fs2.c Bugfix for choosing between NTT and non-NTT. 
Needs more testing before doing choice for P+1 the same way ------------------------------------------------------------------------ r1557 | zimmerma | 2010-12-17 16:14:22 +0100 (Fri, 17 Dec 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed one blank (test commit) ------------------------------------------------------------------------ r1556 | kruppa | 2010-12-17 14:18:02 +0100 (Fri, 17 Dec 2010) | 2 lines Changed paths: M /trunk/configure.in Link Woltman's GWNUM with -lpthread, abort if GWNUM+OpenMP is requested ------------------------------------------------------------------------ r1555 | zimmerma | 2010-12-09 15:32:17 +0100 (Thu, 09 Dec 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed trailing blank (test) ------------------------------------------------------------------------ r1554 | zimmerma | 2010-12-05 22:21:25 +0100 (Sun, 05 Dec 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] now recognizes MPIR ------------------------------------------------------------------------ r1553 | zimmerma | 2010-11-24 16:13:11 +0100 (Wed, 24 Nov 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] we now need at least a p66 to enter the ECM top ten! 
------------------------------------------------------------------------ r1552 | brian_gladman | 2010-11-22 20:17:10 +0100 (Mon, 22 Nov 2010) | 1 line Changed paths: M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/libecm/libecm.vcxproj M /trunk/build.vc10/mp_lib.props M /trunk/build.vc10/tune/tune.vcxproj minor Windows build changes ------------------------------------------------------------------------ r1551 | brian_gladman | 2010-11-22 13:33:06 +0100 (Mon, 22 Nov 2010) | 1 line Changed paths: M /trunk/build.vc10/config.h M /trunk/build.vc10/ecm/ecm.vcxproj M /trunk/build.vc10/ecm.sln M /trunk/build.vc10/libecm/libecm.vcxproj minor Windows build changes ------------------------------------------------------------------------ r1550 | zimmerma | 2010-11-19 15:03:23 +0100 (Fri, 19 Nov 2010) | 3 lines Changed paths: M /trunk/mul_fft.c [mul_fft.c] check for malloc return value in main memory allocation, and point to -maxmem option ------------------------------------------------------------------------ r1549 | zimmerma | 2010-11-18 21:14:06 +0100 (Thu, 18 Nov 2010) | 2 lines Changed paths: M /trunk/pm1fs2.c [pm1fs2.c] fixed typos in comments ------------------------------------------------------------------------ r1548 | zimmerma | 2010-11-18 20:57:40 +0100 (Thu, 18 Nov 2010) | 2 lines Changed paths: M /trunk/pm1.c [pm1.c] p-1 -> P-1 ------------------------------------------------------------------------ r1547 | zimmerma | 2010-11-12 15:12:10 +0100 (Fri, 12 Nov 2010) | 2 lines Changed paths: M /trunk/NEWS [NEWS] added more stuff ------------------------------------------------------------------------ r1546 | zimmerma | 2010-11-12 15:01:48 +0100 (Fri, 12 Nov 2010) | 3 lines Changed paths: M /trunk/NEWS M /trunk/configure.in [configure.in] fixed problem with SSE2 support (http://trac.sagemath.org/sage_trac/ticket/10252) ------------------------------------------------------------------------ r1545 | zimmerma | 2010-11-10 11:16:13 +0100 (Wed, 10 Nov 2010) | 2 
lines Changed paths: M /trunk/main.c [main.c] removed trailing blank ------------------------------------------------------------------------ r1544 | zimmerma | 2010-11-10 11:04:02 +0100 (Wed, 10 Nov 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed another trailing blank ------------------------------------------------------------------------ r1543 | zimmerma | 2010-11-10 10:52:14 +0100 (Wed, 10 Nov 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed another trailing blank (another commit log test) ------------------------------------------------------------------------ r1542 | zimmerma | 2010-11-10 10:36:46 +0100 (Wed, 10 Nov 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed trailing blank (commit log test) ------------------------------------------------------------------------ r1541 | zimmerma | 2010-11-08 17:50:20 +0100 (Mon, 08 Nov 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed trailing blank ------------------------------------------------------------------------ r1540 | zimmerma | 2010-11-08 07:53:35 +0100 (Mon, 08 Nov 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] removed trailing space ------------------------------------------------------------------------ r1539 | brian_gladman | 2010-10-30 19:31:28 +0200 (Sat, 30 Oct 2010) | 1 line Changed paths: A /trunk/build.vc10 A /trunk/build.vc10/assembler A /trunk/build.vc10/assembler/a_win32a_mulredc.asm A /trunk/build.vc10/assembler/a_win32a_redc.asm A /trunk/build.vc10/assembler/a_win32p_mulredc.asm A /trunk/build.vc10/assembler/a_win32p_redc.asm A /trunk/build.vc10/assembler/a_x64_mulredc.asm A /trunk/build.vc10/assembler/a_x64_redc.asm A /trunk/build.vc10/assembler/mulredc.h A /trunk/build.vc10/assembler/test_mulredc.c A /trunk/build.vc10/config.h A /trunk/build.vc10/ecm A /trunk/build.vc10/ecm/ecm.vcxproj A /trunk/build.vc10/ecm/ecm.vcxproj.filters A /trunk/build.vc10/ecm-params.h A /trunk/build.vc10/ecm-params.h.win32.amd A 
/trunk/build.vc10/ecm-params.h.win32.intel A /trunk/build.vc10/ecm-params.h.x64.amd A /trunk/build.vc10/ecm-params.h.x64.intel A /trunk/build.vc10/ecm.sln A /trunk/build.vc10/file_copy.bat A /trunk/build.vc10/libecm A /trunk/build.vc10/libecm/libecm.vcxproj A /trunk/build.vc10/libecm/libecm.vcxproj.filters A /trunk/build.vc10/mp_lib.props A /trunk/build.vc10/mul_fft-params.h.win32.amd A /trunk/build.vc10/mul_fft-params.h.win32.intel A /trunk/build.vc10/mul_fft-params.h.x64.amd A /trunk/build.vc10/mul_fft-params.h.x64.intel A /trunk/build.vc10/readme.txt A /trunk/build.vc10/tests.py A /trunk/build.vc10/tune A /trunk/build.vc10/tune/tune.vcxproj A /trunk/build.vc10/tune/tune.vcxproj.filters A /trunk/build.vc10/vsyasm.props A /trunk/build.vc10/vsyasm.targets A /trunk/build.vc10/vsyasm.xml Add a Visual Studio 2010 build ------------------------------------------------------------------------ r1538 | brian_gladman | 2010-10-30 19:07:41 +0200 (Sat, 30 Oct 2010) | 1 line Changed paths: M /trunk/longlong.h Further correction of _PROTO define ------------------------------------------------------------------------ r1537 | brian_gladman | 2010-10-30 16:57:34 +0200 (Sat, 30 Oct 2010) | 2 lines Changed paths: M /trunk/mul_fft.c Correct bug in definition of _PROTO in mul_fft.c ------------------------------------------------------------------------ r1536 | zimmerma | 2010-10-22 10:28:43 +0200 (Fri, 22 Oct 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] updated champions table ------------------------------------------------------------------------ r1535 | zimmerma | 2010-09-10 16:58:27 +0200 (Fri, 10 Sep 2010) | 2 lines Changed paths: A /trunk/gpu/modular_arithmetic.h [modular_arithmetic.h] header file for routines in modular_arithmetic.c ------------------------------------------------------------------------ r1534 | zimmerma | 2010-09-10 16:57:45 +0200 (Fri, 10 Sep 2010) | 2 lines Changed paths: A /trunk/gpu/stage1-c.c [stage1-c.c] new file, plain-C version of 
stage1.c ------------------------------------------------------------------------ r1533 | zimmerma | 2010-09-10 16:41:33 +0200 (Fri, 10 Sep 2010) | 3 lines Changed paths: M /trunk/gpu/makefile M /trunk/gpu/modular_arithmetic.c M /trunk/gpu/prototype.c M /trunk/gpu/prototype.h now stage1-c seems to give similar results as stage1 (but the efficiency can still be improved) ------------------------------------------------------------------------ r1532 | zimmerma | 2010-09-08 13:34:01 +0200 (Wed, 08 Sep 2010) | 2 lines Changed paths: M /trunk/gpu/makefile M /trunk/gpu/modular_arithmetic.c cleanup in file modular_arithmetic.c ------------------------------------------------------------------------ r1531 | zimmerma | 2010-09-08 13:19:50 +0200 (Wed, 08 Sep 2010) | 2 lines Changed paths: M /trunk/gpu/makefile M /trunk/gpu/prototype.c M /trunk/gpu/stage1.c small changes to avoid compiler warnings ------------------------------------------------------------------------ r1530 | zimmerma | 2010-09-08 11:50:53 +0200 (Wed, 08 Sep 2010) | 3 lines Changed paths: M /trunk/gpu/makefile M /trunk/gpu/prototype.c M /trunk/gpu/stage1.c fixed computation of d=(a+2)/4 mod N in prototype.c, added copyright notice, and started to clean up code ------------------------------------------------------------------------ r1529 | zimmerma | 2010-09-08 11:04:36 +0200 (Wed, 08 Sep 2010) | 2 lines Changed paths: M /trunk/INSTALL [INSTALL] updated info about sparc problem ------------------------------------------------------------------------ r1528 | zimmerma | 2010-09-07 08:37:27 +0200 (Tue, 07 Sep 2010) | 2 lines Changed paths: M /trunk/INSTALL [INSTALL] added known problem on sparc with GCC 4.4.5 ------------------------------------------------------------------------ r1527 | kruppa | 2010-07-31 15:35:01 +0200 (Sat, 31 Jul 2010) | 3 lines Changed paths: M /trunk/athlon/Makefile.am M /trunk/pentium4/Makefile.am M /trunk/powerpc64/Makefile.am M /trunk/x86_64/Makefile.am Avoid spurious dependency of 
libmulredc on GMP and libm. ------------------------------------------------------------------------ r1526 | kruppa | 2010-07-31 15:33:57 +0200 (Sat, 31 Jul 2010) | 5 lines Changed paths: M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc1.asm M /trunk/x86_64/mulredc1.m4 PIC-ify calls to abort(). We always call abort@plt now, which on Linux seems to work even in a static library, but may not be portable. Needs testing. ------------------------------------------------------------------------ r1525 | zimmerma | 2010-07-28 11:09:59 +0200 (Wed, 28 Jul 2010) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] patch from Laurent Fousse (libecm.so was not linked against gmp) ------------------------------------------------------------------------ r1524 | kruppa | 2010-07-24 13:35:53 +0200 (Sat, 24 Jul 2010) | 3 lines Changed paths: M /trunk/Makefile.am Removed linker flags from LDADD to avoid spurious dependencies Include mulredc in libecm.la only if asm redc is actually used ------------------------------------------------------------------------ r1523 | zimmerma | 2010-07-09 11:14:01 +0200 (Fri, 09 Jul 2010) | 2 lines Changed paths: M /trunk/gpu/makefile M /trunk/gpu/modular_arithmetic.c fixed bug in mul() ------------------------------------------------------------------------ r1522 | zimmerma | 2010-07-09 10:29:10 +0200 (Fri, 09 Jul 2010) | 2 lines Changed paths: M /trunk/gpu/makefile [makefile] added target ------------------------------------------------------------------------ r1521 | feltin | 2010-07-08 17:07:38 +0200 (Thu, 08 Jul 2010) | 1 line Changed paths: A /trunk/gpu/getprime.c A /trunk/gpu/getprime.h M /trunk/gpu/modular_arithmetic.c ------------------------------------------------------------------------ r1520 | feltin | 2010-07-07 17:09:57 +0200 (Wed, 07 Jul 2010) | 2 lines Changed paths: A /trunk/gpu/modular_arithmetic.c Modular arithmetic version in C without using the GMP library 
------------------------------------------------------------------------ r1519 | zimmerma | 2010-07-02 16:34:15 +0200 (Fri, 02 Jul 2010) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] better fix for #10648 (contributed from Vincent Lefèvre) ------------------------------------------------------------------------ r1518 | zimmerma | 2010-07-01 17:38:51 +0200 (Thu, 01 Jul 2010) | 4 lines Changed paths: M /trunk/configure.in [configure.in] better fix from Vincent Lefevre: even on x86_64, we might use GMP with ABI=32, in which case we shouldn't use the (64-bit) assembly redc from x86_64 ------------------------------------------------------------------------ r1517 | zimmerma | 2010-07-01 17:31:34 +0200 (Thu, 01 Jul 2010) | 3 lines Changed paths: M /trunk/Makefile.am [Makefile.am] patch to solve bug #10648 from tracker, however I'm not sure this solution is the right one, if needed we can revert it. ------------------------------------------------------------------------ r1516 | zimmerma | 2010-06-30 15:50:19 +0200 (Wed, 30 Jun 2010) | 2 lines Changed paths: M /trunk/configure.in [configure.in] patch from Vincent Lefevre (see bug tracker #10646) ------------------------------------------------------------------------ r1515 | zimmerma | 2010-06-30 14:54:20 +0200 (Wed, 30 Jun 2010) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] added comment ------------------------------------------------------------------------ r1514 | zimmerma | 2010-06-30 14:28:04 +0200 (Wed, 30 Jun 2010) | 4 lines Changed paths: M /trunk/configure.in [configure.in] fixed bug reported by Vincent Lefevre: incorrect configure --help output for --enable-asm-redc (#10649 on bug tracker) ------------------------------------------------------------------------ r1513 | zimmerma | 2010-06-30 14:18:56 +0200 (Wed, 30 Jun 2010) | 2 lines Changed paths: M /trunk/configure.in [configure.in] fixed typo ------------------------------------------------------------------------ r1512 | feltin | 
2010-06-21 11:03:53 +0200 (Mon, 21 Jun 2010) | 2 lines Changed paths: M /trunk/gpu/makefile M /trunk/gpu/prototype.c M /trunk/gpu/prototype.h M /trunk/gpu/stage1.c Stage1 of algorithm ECM (version 2) ------------------------------------------------------------------------ r1511 | zimmerma | 2010-06-15 16:25:05 +0200 (Tue, 15 Jun 2010) | 2 lines Changed paths: M /trunk/INSTALL [INSTALL] new section with known problems ------------------------------------------------------------------------ r1510 | feltin | 2010-06-15 15:48:17 +0200 (Tue, 15 Jun 2010) | 2 lines Changed paths: A /trunk/gpu/makefile A /trunk/gpu/prototype.c A /trunk/gpu/prototype.h A /trunk/gpu/stage1.c first version of ECM (using GMP) ------------------------------------------------------------------------ r1509 | zimmerma | 2010-06-15 09:15:29 +0200 (Tue, 15 Jun 2010) | 2 lines Changed paths: A /trunk/gpu [gpu] new subdirectory to experiment with GPU code for stage 1 ------------------------------------------------------------------------ r1508 | zimmerma | 2010-05-22 12:39:45 +0200 (Sat, 22 May 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] updated smallest champion size for P-1 ------------------------------------------------------------------------ r1507 | zimmerma | 2010-05-17 00:38:19 +0200 (Mon, 17 May 2010) | 2 lines Changed paths: M /trunk/main.c [main.c] now we need at least 64 digits to enter the ECM champion list! 
------------------------------------------------------------------------ r1506 | kruppa | 2010-04-27 13:52:26 +0200 (Tue, 27 Apr 2010) | 2 lines Changed paths: M /trunk/build.vc9/Makefile.am M /trunk/build.vc9/assembler/Makefile.am A /trunk/build.vc9/tune/Makefile.am M /trunk/configure.in Added missing files for Visual C build ------------------------------------------------------------------------ r1505 | zimmerma | 2010-04-24 23:05:26 +0200 (Sat, 24 Apr 2010) | 2 lines Changed paths: M /trunk/TODO [TODO] updated done item ------------------------------------------------------------------------ r1504 | zimmerma | 2010-04-24 22:59:28 +0200 (Sat, 24 Apr 2010) | 3 lines Changed paths: M /trunk/configure.in M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/sp.h M /trunk/spm.c implemented fast conversion from mpz_t to RNS (hard-coded threshold is 2^7 moduli, which seems close to optimal on a 64-bit machine) ------------------------------------------------------------------------ r1502 | kruppa | 2010-04-19 11:19:49 +0200 (Mon, 19 Apr 2010) | 3 lines Changed paths: M /trunk/ChangeLog Last commits for 6.3 added ------------------------------------------------------------------------ r1501 | kruppa | 2010-04-18 22:14:22 +0200 (Sun, 18 Apr 2010) | 5 lines Version 6.3. Assertions off by default. Added new thresholds to 64-bit parameter files in build.vc9/. 
------------------------------------------------------------------------ r1500 | kruppa | 2010-04-17 01:40:06 +0200 (Sat, 17 Apr 2010) | 4 lines Include ecm-params.h.core2 in distribution Updated record factor sizes ------------------------------------------------------------------------ r1499 | kruppa | 2010-04-17 01:26:15 +0200 (Sat, 17 Apr 2010) | 4 lines Specify source dir path for /powerpc64/powerpc-defs.m4 include to make out-of-source builds work ------------------------------------------------------------------------ r1498 | kruppa | 2010-04-16 23:41:03 +0200 (Fri, 16 Apr 2010) | 4 lines Detect Core 2 cpus if /proc/cpuinfo exists and use correct parameter file; warn about choosing right parameters if no /proc/cpuinfo exists on x86_64 ------------------------------------------------------------------------ r1497 | kruppa | 2010-04-16 18:11:31 +0200 (Fri, 16 Apr 2010) | 4 lines Threshold 100 for using multi-threading in mpzspv_from_mpzv() was too low for ECM, actually increased run-time.
Threshold increased to 16384 ------------------------------------------------------------------------ r1496 | kruppa | 2010-04-15 12:11:20 +0200 (Thu, 15 Apr 2010) | 2 lines Add underscores to abort call on systems that need it ------------------------------------------------------------------------ r1495 | kruppa | 2010-04-14 23:16:48 +0200 (Wed, 14 Apr 2010) | 2 lines Test for mpn_sqr() and if it doesn't exist, fall back to mpn_mul() ------------------------------------------------------------------------ r1494 | kruppa | 2010-04-13 19:15:12 +0200 (Tue, 13 Apr 2010) | 4 lines Fixed missing return value that caused compiler warning Removed OMP critical that was left over from debugging ------------------------------------------------------------------------ r1493 | kruppa | 2010-04-11 21:46:36 +0200 (Sun, 11 Apr 2010) | 3 lines Small changes: check if --with-gmp dir exists, clarification of comment, remove duplicate case ------------------------------------------------------------------------ r1492 | kruppa | 2010-04-11 00:44:00 +0200 (Sun, 11 Apr 2010) | 4 lines Get addresses of input operands to mpres_mul() after realloc for result. 
Input operands may change address if input and output are the same mpz_t, and reading from the old, now free()'d, address makes valgrind unhappy ------------------------------------------------------------------------ r1491 | kruppa | 2010-04-10 23:45:24 +0200 (Sat, 10 Apr 2010) | 2 lines ASSERT input < modulus was too strict, NTT code itself does not fully reduce ------------------------------------------------------------------------ r1490 | kruppa | 2010-04-10 23:34:11 +0200 (Sat, 10 Apr 2010) | 2 lines Fixed missing mod reductions that could lead to NTT overflows ------------------------------------------------------------------------ r1489 | zimmerma | 2010-04-09 20:57:52 +0200 (Fri, 09 Apr 2010) | 2 lines [TODO] corrected item about mpn_redc_2 ------------------------------------------------------------------------ r1488 | zimmerma | 2010-04-09 19:45:57 +0200 (Fri, 09 Apr 2010) | 2 lines [TODO] forgot to commit 2nd item in last change ------------------------------------------------------------------------ r1487 | zimmerma | 2010-04-09 19:45:29 +0200 (Fri, 09 Apr 2010) | 2 lines [TODO] added two items suggested by T. Granlund ------------------------------------------------------------------------ r1486 | zimmerma | 2010-04-09 19:31:09 +0200 (Fri, 09 Apr 2010) | 2 lines bumped version number to 6.3-rc4 (just to try "make dist") ------------------------------------------------------------------------ r1485 | zimmerma | 2010-04-09 19:25:07 +0200 (Fri, 09 Apr 2010) | 4 lines [ecm-params.h.alpha-ev56] new parameter file for Alpha ev56 [Makefile.am] added ecm-params.h.alpha-ev56 in "make dist" [configure.in] now takes new default parameter files ------------------------------------------------------------------------ r1484 | kruppa | 2010-04-09 19:10:09 +0200 (Fri, 09 Apr 2010) | 4 lines Small changes to make building outside of the source directory work. Make libmulredc.a depend on config.m4 in all asm subdirectories. 
------------------------------------------------------------------------ r1483 | zimmerma | 2010-04-09 19:06:09 +0200 (Fri, 09 Apr 2010) | 2 lines [README.dev] added item to make a new release ------------------------------------------------------------------------ r1482 | zimmerma | 2010-04-09 19:03:13 +0200 (Fri, 09 Apr 2010) | 3 lines [ecm-params.h.pentium4] updated parameters for ecm-6.3 [Makefile.am] added hppa2.0 default parameters ------------------------------------------------------------------------ r1481 | kruppa | 2010-04-09 17:44:31 +0200 (Fri, 09 Apr 2010) | 4 lines Looks like gas wants "#" as comment separator and Apple Mac OS X assembler wants ";" so now we use M4 to discard comments ------------------------------------------------------------------------ r1480 | kruppa | 2010-04-09 16:13:09 +0200 (Fri, 09 Apr 2010) | 4 lines Make -printconfig print whether Windows ABI is used for assembler functions Q&D (and #define'd out) test of mulredc1_*() functions ------------------------------------------------------------------------ r1479 | kruppa | 2010-04-09 16:02:40 +0200 (Fri, 09 Apr 2010) | 3 lines Likewise fixed incorrect reading of inv_m and missing quotes ------------------------------------------------------------------------ r1478 | kruppa | 2010-04-09 15:27:37 +0200 (Fri, 09 Apr 2010) | 6 lines Fixed incorrect quoting which broke the asserts and the switch to MS ABI in mulredc.m4. Fixed incorrect reading of inv_m from the stack. No longer include x86_64/redc.asm in build or distribution.
------------------------------------------------------------------------ r1477 | zimmerma | 2010-04-08 16:07:04 +0200 (Thu, 08 Apr 2010) | 2 lines [ecm-params.h.hppa2.0] parameter file for hppa2.0 ------------------------------------------------------------------------ r1476 | zimmerma | 2010-04-08 15:19:34 +0200 (Thu, 08 Apr 2010) | 2 lines [sp.h] fix to make umul_ppmm() work properly on hppa2.0 (gcc61.fsffrance.org) ------------------------------------------------------------------------ r1475 | zimmerma | 2010-04-08 15:07:29 +0200 (Thu, 08 Apr 2010) | 2 lines [ecm-params.h.pentium-m] updated ------------------------------------------------------------------------ r1474 | zimmerma | 2010-04-07 17:48:22 +0200 (Wed, 07 Apr 2010) | 3 lines [ecm-params.h.ia64] new parameter file [Makefile.am] added ecm-params.h.ia64 in make dist ------------------------------------------------------------------------ r1473 | kruppa | 2010-04-07 17:36:19 +0200 (Wed, 07 Apr 2010) | 6 lines Changed comments from C++ style "//" to assembler style "#" to avoid .S files which cause trouble on case-insensitive filesystems. Removed mulredc*.asm files from SVN as these are generated code. Added rules to Makefile.am to generate the mulredc*.asm files. 
------------------------------------------------------------------------ r1472 | zimmerma | 2010-04-07 15:13:59 +0200 (Wed, 07 Apr 2010) | 3 lines [ecm-params.h.sparc64] parameter file for sparc64 [Makefile.am] added ecm-params.h.sparc64 to make dist ------------------------------------------------------------------------ r1471 | zimmerma | 2010-04-07 14:00:27 +0200 (Wed, 07 Apr 2010) | 2 lines [Makefile.am] added new parameter files to make dist ------------------------------------------------------------------------ r1470 | zimmerma | 2010-04-07 13:52:53 +0200 (Wed, 07 Apr 2010) | 2 lines [ecm-params.h.athlon] added comment (this is for Opteron) ------------------------------------------------------------------------ r1469 | zimmerma | 2010-04-07 13:04:26 +0200 (Wed, 07 Apr 2010) | 3 lines [ecm-params.h.armv5tel] new parameter file for ARM [ecm-params.h.mips64el,ecm-params.h.powerpc970] added version of GMP used ------------------------------------------------------------------------ r1468 | zimmerma | 2010-04-07 11:12:54 +0200 (Wed, 07 Apr 2010) | 2 lines [ecm-params.h.mips64el] new parameter file for MIPS64 ------------------------------------------------------------------------ r1467 | zimmerma | 2010-04-07 09:56:05 +0200 (Wed, 07 Apr 2010) | 2 lines [ecm-params.h.powerpc970] updated tuning parameters for 6.3 ------------------------------------------------------------------------ r1466 | kruppa | 2010-04-06 17:25:50 +0200 (Tue, 06 Apr 2010) | 3 lines Test if compiler understands __attribute__((hot)) at configure time Marked some hot-spot functions of ECM accordingly ------------------------------------------------------------------------ r1465 | kruppa | 2010-04-02 14:41:42 +0200 (Fri, 02 Apr 2010) | 3 lines Removed duplicate LT_PREREQ, lowered requirement from 2.2.6b to 2.2.6. 
------------------------------------------------------------------------ r1464 | kruppa | 2010-04-01 17:16:34 +0200 (Thu, 01 Apr 2010) | 6 lines Separate thresholds for mulredc*() functions for squaring and general multiplication, since GMP mpn_sqr() is faster than mpn_mul_n() Use __gmpn_redc_1() function if configure finds it, although at this time it doesn't seem to be faster than the loop over mpn_addmul_1() ------------------------------------------------------------------------ r1463 | kruppa | 2010-03-31 11:35:48 +0200 (Wed, 31 Mar 2010) | 2 lines Print TUNE_MULREDC_THRESH with -printconfig ------------------------------------------------------------------------ r1462 | kruppa | 2010-03-31 10:27:19 +0200 (Wed, 31 Mar 2010) | 2 lines Fixed uninitialised variable in assertion ------------------------------------------------------------------------ r1461 | zimmerma | 2010-03-31 10:22:47 +0200 (Wed, 31 Mar 2010) | 2 lines [README.dev] added two reminders for making a new release ------------------------------------------------------------------------ r1460 | kruppa | 2010-03-31 01:53:51 +0200 (Wed, 31 Mar 2010) | 9 lines Switch assembly code to Windows ABI under MinGW Assembly redc3() function disabled on x86_64, GMP is faster New threshold for tuning: TUNE_MULREDC_THRESH, determines when to switch from asm mulredc*() functions to GMP Changed functions called by mpres_mul() to reduce call overhead Added "longcheck" target which runs the test scripts with different parameters and with valgrind, if configure found valgrind ------------------------------------------------------------------------ r1459 | kruppa | 2010-03-18 17:46:07 +0100 (Thu, 18 Mar 2010) | 3 lines Make asm code switch to Windows 64 ABI if WINDOWS64_ABI is defined in config.m4. Completely untested.
mulredc1.m4 is to be done yet ------------------------------------------------------------------------ r1458 | kruppa | 2010-03-17 23:10:02 +0100 (Wed, 17 Mar 2010) | 2 lines Auto-generated files, should not be in SVN ------------------------------------------------------------------------ r1457 | kruppa | 2010-03-17 22:03:23 +0100 (Wed, 17 Mar 2010) | 2 lines Added -printconfig parameter ------------------------------------------------------------------------ r1456 | kruppa | 2010-03-17 19:01:40 +0100 (Wed, 17 Mar 2010) | 5 lines Try to import CC and CFLAGS from gmp.h (copied from MPFR 2.4.2) Detect SSE2 support by test compilation Use CCASFLAGS with CCAS Stricter quoting ------------------------------------------------------------------------ r1455 | kruppa | 2010-03-17 18:58:36 +0100 (Wed, 17 Mar 2010) | 3 lines Commented out .asm -> .S rule to avoid problems on case-insensitive filesystems ------------------------------------------------------------------------ r1454 | kruppa | 2010-03-16 17:47:49 +0100 (Tue, 16 Mar 2010) | 2 lines Truncating pointer conversion bug fixed, pointed out by David Cleaver ------------------------------------------------------------------------ r1453 | kruppa | 2010-03-16 17:34:39 +0100 (Tue, 16 Mar 2010) | 5 lines Change use of CFLAGS, CCASFLAGS and LDFLAGS when checking for underscores to match that in the resulting Makefile. Fixes error when -m64 was added to CCASFLAGS. ------------------------------------------------------------------------ r1452 | kruppa | 2010-03-10 22:15:44 +0100 (Wed, 10 Mar 2010) | 8 lines Use autoconf's test to determine how to enable OpenMP in the compiler. Enable OpenMP only for those targets that need it to avoid spurious library dependencies. Link GSL only to rho in test-drive mode to avoid spurious library dependency. Compile GSL-dependent code in rho.c only in test-drive mode. 
More consistent quoting in configure.in ------------------------------------------------------------------------ r1451 | kruppa | 2010-03-10 13:59:21 +0100 (Wed, 10 Mar 2010) | 3 lines Keep linking to GMP library while checking for GMP functions such as __gmpn_add_nc and __gmpn_mod_34lsub1 ------------------------------------------------------------------------ r1450 | zimmerma | 2010-03-09 11:35:21 +0100 (Tue, 09 Mar 2010) | 3 lines [ecm.c] clean up the code to print expected number of curves and time also changed to print from 35 to 80 digits instead of 20 to 65 ------------------------------------------------------------------------ r1449 | kruppa | 2010-03-07 21:53:40 +0100 (Sun, 07 Mar 2010) | 4 lines powerpc64 needs -m64 flag for gcc to produce 64 bit build. Without the flag, it produces a 32 bit build seemingly successfully, but the 64-bit mulredc asm code produces incorrect arithmetic. ------------------------------------------------------------------------ r1447 | kruppa | 2010-03-07 17:25:37 +0100 (Sun, 07 Mar 2010) | 8 lines Changed rules for building manpage ecm.1. Previously the man page would be built if ecm.1 was missing or was older than ecm.xml, even if xsltproc or docbook.xsl were not found by ./configure, causing "make" to exit with error. Furthermore, listing ecm.1 in two output variables would cause "make install" to try to install the same man page twice, leading to a warning. 
------------------------------------------------------------------------ r1446 | kruppa | 2010-03-07 16:18:06 +0100 (Sun, 07 Mar 2010) | 2 lines File locking item removed, it's done ------------------------------------------------------------------------ r1445 | kruppa | 2010-03-07 02:23:44 +0100 (Sun, 07 Mar 2010) | 2 lines Changed paths: M /trunk/README Point out explicitly that -save saves after stage 2 ------------------------------------------------------------------------ r1444 | kruppa | 2010-03-07 00:47:56 +0100 (Sun, 07 Mar 2010) | 2 lines Changed paths: M /trunk/resume.c Process LF, CR, and CR/LF as newline when reading save files ------------------------------------------------------------------------ r1443 | kruppa | 2010-03-05 15:53:07 +0100 (Fri, 05 Mar 2010) | 2 lines Changed paths: M /trunk/INSTALL Refer to latest GMP version 5.0.1 ------------------------------------------------------------------------ r1442 | kruppa | 2010-03-03 17:18:47 +0100 (Wed, 03 Mar 2010) | 8 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in On Darwin x86_64 systems, see if we need to pass -m64 to gcc to get 64 bit code. Don't add the GMP library to LIBS, but let Makefile.am add it to LDADD instead, to avoid GMP getting copied into the GMP-ECM libraries which is non-portable and seems to break linking on Darwin systems. 
------------------------------------------------------------------------ r1441 | kruppa | 2010-03-03 17:13:40 +0100 (Wed, 03 Mar 2010) | 5 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c M /trunk/rho.c M /trunk/schoen_strass.c Fixed operator precedence bug in schoen_strass.c Compile Buchstab_omega() in rho.c only with GSL to avoid warning Added mpres_equal() function ------------------------------------------------------------------------ r1440 | kruppa | 2010-03-03 15:48:45 +0100 (Wed, 03 Mar 2010) | 2 lines Changed paths: M /trunk/pm1fs2.c Hide omp critical pragma if OMP isn't used to avoid compiler warning ------------------------------------------------------------------------ r1439 | kruppa | 2010-02-15 18:34:20 +0100 (Mon, 15 Feb 2010) | 5 lines Changed paths: M /trunk/configure.in M /trunk/ecm-ecm.h M /trunk/main.c M /trunk/resume.c Removed some dead code from checkpoint writing. Changed writing save file lines always to append to the file, with file locking if fcntl() is available. ------------------------------------------------------------------------ r1438 | kruppa | 2010-02-10 15:17:53 +0100 (Wed, 10 Feb 2010) | 3 lines Changed paths: M /trunk/phiP.gp Slight cleanups ------------------------------------------------------------------------ r1437 | kruppa | 2010-02-10 14:41:32 +0100 (Wed, 10 Feb 2010) | 3 lines Changed paths: M /trunk/pm1fs2.c Previous commit included P values where code could not factor phi(P).
Fixed ------------------------------------------------------------------------ r1436 | kruppa | 2010-02-10 13:41:29 +0100 (Wed, 10 Feb 2010) | 3 lines Changed paths: M /trunk/pm1fs2.c More P values to allow larger B2 in a single run ------------------------------------------------------------------------ r1435 | kruppa | 2010-02-10 01:49:03 +0100 (Wed, 10 Feb 2010) | 3 lines Changed paths: M /trunk/pm1fs2.c More parallelization while building f(x) ------------------------------------------------------------------------ r1434 | kruppa | 2010-02-01 17:39:17 +0100 (Mon, 01 Feb 2010) | 2 lines Changed paths: M /trunk/pm1fs2.c Cleanup in list_scale_V(), slightly more parallelization ------------------------------------------------------------------------ r1433 | kruppa | 2010-02-01 14:48:52 +0100 (Mon, 01 Feb 2010) | 3 lines Changed paths: M /trunk/bestd.c M /trunk/ecm-impl.h M /trunk/ks-multiply.c M /trunk/mpmod.c M /trunk/pm1fs2.c M /trunk/schoen_strass.c M /trunk/sp.h M /trunk/stage2.c Replaced __GMP_BITS_PER_MP_LIMB and most mp_bits_per_limb by GMP_NUMB_BITS ------------------------------------------------------------------------ r1432 | zimmerma | 2010-01-30 22:21:43 +0100 (Sat, 30 Jan 2010) | 2 lines Changed paths: M /trunk/ChangeLog M /trunk/INSTALL M /trunk/Makefile.am M /trunk/NEWS M /trunk/build.vc9/config.h M /trunk/ecm.h preparation for the release of ecm-6.3 ------------------------------------------------------------------------ r1431 | kruppa | 2010-01-23 20:55:47 +0100 (Sat, 23 Jan 2010) | 2 lines Changed paths: M /trunk/pm1fs2.c Make "one-pass" P+1 stage 2 use parallel transforms ------------------------------------------------------------------------ r1430 | kruppa | 2010-01-22 23:36:34 +0100 (Fri, 22 Jan 2010) | 5 lines Changed paths: M /trunk/mpmod.c Changed __GMP_BITS_PER_MP_LIMB to GMP_NUMB_BITS Bugfix in expensive assert check for mulredc Cleanup in powering functions ------------------------------------------------------------------------ r1429 | 
kruppa | 2010-01-22 21:42:31 +0100 (Fri, 22 Jan 2010) | 3 lines Changed paths: M /trunk/configure.in Enable asm mulredc by default only on x86_64 and 64 bit PowerPC Check for GSL ------------------------------------------------------------------------ r1428 | kruppa | 2010-01-22 21:23:19 +0100 (Fri, 22 Jan 2010) | 3 lines Changed paths: M /trunk/bench_mulredc.c Slightly more readable output ------------------------------------------------------------------------ r1427 | kruppa | 2010-01-20 14:20:32 +0100 (Wed, 20 Jan 2010) | 3 lines Changed paths: M /trunk/rho.gp Make results agree better with those from rho.c Added functions for P+1 prob, and for small B2 ------------------------------------------------------------------------ r1426 | zimmerma | 2010-01-08 19:03:00 +0100 (Fri, 08 Jan 2010) | 2 lines Changed paths: M /trunk/sp.h [sp.h] define __GMP_BITS_PER_MP_LIMB from GMP_LIMB_BITS when undefined ------------------------------------------------------------------------ r1425 | kruppa | 2009-11-06 03:31:28 +0100 (Fri, 06 Nov 2009) | 5 lines Changed paths: M /trunk/rho.c Functions for counting and estimating smooth and rough numbers. For small B2, estimate stage 2 probability with sum instead of integral. 
------------------------------------------------------------------------ r1424 | kruppa | 2009-11-01 15:53:30 +0100 (Sun, 01 Nov 2009) | 2 lines Changed paths: M /trunk/Makefile.am Add target "rho" to test code in rho.c ------------------------------------------------------------------------ r1423 | kruppa | 2009-11-01 15:39:42 +0100 (Sun, 01 Nov 2009) | 3 lines Changed paths: M /trunk/rho.c Functions for computing \Phi(x,y) and \Psi(x,y) exactly, and an estimate for Phi(x,y) (number of y-rough numbers) ------------------------------------------------------------------------ r1422 | zimmerma | 2009-08-31 10:23:26 +0200 (Mon, 31 Aug 2009) | 3 lines Changed paths: M /trunk/tune.c [tune.c] replace obsolete mpz_random by mpz_urandomb (suggested by Jason Moxham) ------------------------------------------------------------------------ r1421 | brian_gladman | 2009-08-11 17:35:55 +0200 (Tue, 11 Aug 2009) | 1 line Changed paths: A /trunk/build.vc9/mp_lib.vsprops ------------------------------------------------------------------------ r1420 | brian_gladman | 2009-08-11 17:35:17 +0200 (Tue, 11 Aug 2009) | 1 line Changed paths: M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/readme.txt M /trunk/build.vc9/tests.py M /trunk/build.vc9/tune/tune.vcproj Update Windows VC++ build to use standard output directories and to use MPIR by default. ------------------------------------------------------------------------ r1419 | kruppa | 2009-05-28 21:48:48 +0200 (Thu, 28 May 2009) | 3 lines Changed paths: M /trunk/main.c Allow -printconfig as only parameter, exit after printing config. Print tuning parameters, too. 
------------------------------------------------------------------------ r1418 | zimmerma | 2009-05-18 13:59:13 +0200 (Mon, 18 May 2009) | 2 lines Changed paths: M /trunk/mpzspv.c [mpzspv.c] get rid of malloc_usable_size (and thus malloc.h) ------------------------------------------------------------------------ r1417 | zimmerma | 2009-05-18 13:19:15 +0200 (Mon, 18 May 2009) | 2 lines Changed paths: M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/ks-multiply.c M /trunk/test.pm1 [ks-multiply.c] check allocation failure in kronecker_schonhage ------------------------------------------------------------------------ r1416 | zimmerma | 2009-05-18 13:08:21 +0200 (Mon, 18 May 2009) | 2 lines Changed paths: M /trunk/mpzspm.c [mpzspm.c] fixed typo ------------------------------------------------------------------------ r1415 | kruppa | 2009-05-17 23:20:05 +0200 (Sun, 17 May 2009) | 3 lines Changed paths: M /trunk/mpzspm.c M /trunk/pm1fs2.c M /trunk/sp.c M /trunk/spm.c M /trunk/stage2.c Added some error handling for out-of-memory conditions to NTT code ------------------------------------------------------------------------ r1414 | zimmerma | 2009-05-12 16:07:12 +0200 (Tue, 12 May 2009) | 2 lines Changed paths: M /trunk/build.vc9/assembler/test_mulredc.c M /trunk/ecm_ntt.c M /trunk/getprime.c M /trunk/main.c M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/polyeval.c M /trunk/rho.c M /trunk/schoen_strass.c M /trunk/sets_long.c M /trunk/sp.c M /trunk/spm.c M /trunk/stage2.c M /trunk/test_mulredc.c M /trunk/tune.c check return value of malloc in several places (bug reported by Torbjörn Granlund) ------------------------------------------------------------------------ r1413 | zimmerma | 2009-04-25 23:34:58 +0200 (Sat, 25 Apr 2009) | 2 lines Changed paths: M /trunk/NEWS [NEWS] updated with changes between ecm-6.2.2 and ecm-6.2.3 ------------------------------------------------------------------------ r1412 | kruppa | 2009-04-21 17:13:49 +0200 (Tue, 
21 Apr 2009) | 2 lines Changed paths: M /trunk/mul_fft-params.h.default Missing endline at end of file ------------------------------------------------------------------------ r1411 | kruppa | 2009-04-20 00:02:05 +0200 (Mon, 20 Apr 2009) | 2 lines Changed paths: M /trunk/build.vc9/assembler/a_x64_mulredc.asm Ported recent improvements ------------------------------------------------------------------------ r1410 | kruppa | 2009-04-18 22:47:23 +0200 (Sat, 18 Apr 2009) | 4 lines Changed paths: M /trunk/configure.in Cleanup of tests for asm redc code, test list of cpu types only once Print configuration at end of configure Various cleanups (or messups, as the case may be) ------------------------------------------------------------------------ r1409 | zimmerma | 2009-04-18 19:01:08 +0200 (Sat, 18 Apr 2009) | 2 lines Changed paths: M /trunk/TODO [TODO] added item ------------------------------------------------------------------------ r1408 | kruppa | 2009-04-18 17:21:14 +0200 (Sat, 18 Apr 2009) | 3 lines Changed paths: M /trunk/main.c Fixed old bug: if last line did not end in newline, only one curve would be run on that number in spite of -c parameter ------------------------------------------------------------------------ r1407 | kruppa | 2009-04-18 16:03:45 +0200 (Sat, 18 Apr 2009) | 3 lines Changed paths: M /trunk/main.c Added -printconfig option which prints configuration options used for building GMP-ECM ------------------------------------------------------------------------ r1406 | zimmerma | 2009-04-18 15:40:44 +0200 (Sat, 18 Apr 2009) | 2 lines Changed paths: M /trunk/main.c [main.c] print --enable-asm-redc in header line if used ------------------------------------------------------------------------ r1405 | zimmerma | 2009-04-18 15:07:04 +0200 (Sat, 18 Apr 2009) | 2 lines Changed paths: M /trunk/README.dev [README.dev] added item ------------------------------------------------------------------------ r1404 | zimmerma | 2009-04-18 15:03:09 +0200 (Sat, 18
Apr 2009) | 3 lines Changed paths: M /trunk/configure.in [configure.in] applied patch from Peter Jeremy (http://gforge.inria.fr/tracker/index.php?func=detail&aid=7639&group_id=135&atid=623) ------------------------------------------------------------------------ r1403 | kruppa | 2009-04-16 17:51:46 +0200 (Thu, 16 Apr 2009) | 3 lines Changed paths: M /trunk/INSTALL Added warning about incompatible GMP header/library Updated GMP version to 4.3.0 ------------------------------------------------------------------------ r1402 | kruppa | 2009-04-16 17:21:21 +0200 (Thu, 16 Apr 2009) | 2 lines Changed paths: M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc2.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm Moving memory load ahead was missing for last unroll step ------------------------------------------------------------------------ r1401 | kruppa | 2009-04-16 16:39:05 +0200 (Thu, 16 Apr 2009) | 2 lines Changed paths: M /trunk/README Replaced long obsolete files list by basic usage examples ------------------------------------------------------------------------ r1400 | kruppa | 2009-04-16 16:32:19 +0200 (Thu, 16 Apr 2009) | 3 lines Changed paths: M /trunk/configure.in GMP 4.3.0 and newer always have three parts in version string (including patchlevel, even if it's zero). 
This broke a test in configure ------------------------------------------------------------------------ r1399 | kruppa | 2009-04-10 22:50:33 +0200 (Fri, 10 Apr 2009) | 4 lines Changed paths: M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm Moved memory load ahead one instruction. Slight speedup on Opteron/Phenom (~1% for 20 words), noticeable speedup on Core 2 (~8% for 20 words) ------------------------------------------------------------------------ r1398 | zimmerma | 2009-04-01 18:25:52 +0200 (Wed, 01 Apr 2009) | 2 lines Changed paths: M /trunk/hecm/Makefile [hecm/Makefile] link against the libecm.a from .., not /usr/lib/libecm.a! 
------------------------------------------------------------------------ r1397 | zimmerma | 2009-04-01 17:00:18 +0200 (Wed, 01 Apr 2009) | 2 lines Changed paths: M /trunk/README.dev [README.dev] check config.guess is recent enough ------------------------------------------------------------------------ r1396 | zimmerma | 2009-04-01 16:39:47 +0200 (Wed, 01 Apr 2009) | 2 lines Changed paths: D /trunk/configfsf.guess D /trunk/configfsf.sub [configfsf.guess,configfsf.sub] removed unused files ------------------------------------------------------------------------ r1395 | zimmerma | 2009-04-01 14:18:03 +0200 (Wed, 01 Apr 2009) | 2 lines Changed paths: M /trunk/README.dev A /trunk/patch-config.guess.diff [patch-config.guess.diff] patch to fix config.guess on Mac OS X / PowerPC ------------------------------------------------------------------------ r1394 | kruppa | 2009-03-30 15:50:08 +0200 (Mon, 30 Mar 2009) | 4 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm2.c M /trunk/stage2.c Code to find factor of group order of elliptic curve in stage 2 could segfault if factor (of input number) was re-discovered during initialisation of arithmetic progressions on the curve ------------------------------------------------------------------------ r1393 | kruppa | 2009-03-30 15:44:52 +0200 (Mon, 30 Mar 2009) | 2 lines Changed paths: M /trunk/ecm.c Bugfix: rhotable did not get freed if a factor was found in stage 1 ------------------------------------------------------------------------ r1392 | brian_gladman | 2009-03-29 18:16:04 +0200 (Sun, 29 Mar 2009) | 1 line Changed paths: M /trunk/build.vc9/ecm.sln M /trunk/build.vc9/tune/tune.vcproj correct error in Windows tune build ------------------------------------------------------------------------ r1391 | brian_gladman | 2009-03-29 13:44:30 +0200 (Sun, 29 Mar 2009) | 1 line Changed paths: D /trunk/build.vc9/ecm-params.win32.amd.h D /trunk/build.vc9/ecm-params.win32.intel.h D /trunk/build.vc9/ecm-params.x64.amd.h D 
/trunk/build.vc9/ecm-params.x64.intel.h ------------------------------------------------------------------------ r1390 | brian_gladman | 2009-03-29 13:43:13 +0200 (Sun, 29 Mar 2009) | 1 line Changed paths: A /trunk/build.vc9/ecm-params.h.win32.amd A /trunk/build.vc9/ecm-params.h.win32.intel A /trunk/build.vc9/ecm-params.h.x64.amd A /trunk/build.vc9/ecm-params.h.x64.intel M /trunk/build.vc9/libecm/libecm.vcproj A /trunk/build.vc9/mul_fft-params.h.win32.amd A /trunk/build.vc9/mul_fft-params.h.win32.intel A /trunk/build.vc9/mul_fft-params.h.x64.amd A /trunk/build.vc9/mul_fft-params.h.x64.intel M /trunk/build.vc9/readme.txt M /trunk/build.vc9/tune/tune.vcproj Further update for Windows build ------------------------------------------------------------------------ r1389 | brian_gladman | 2009-03-29 00:05:29 +0100 (Sun, 29 Mar 2009) | 1 line Changed paths: M /trunk/build.vc9/tests.py correct test.py for new directory structure ------------------------------------------------------------------------ r1388 | brian_gladman | 2009-03-28 23:59:27 +0100 (Sat, 28 Mar 2009) | 2 lines Changed paths: M /trunk/build.vc9/assembler/a_x64_mulredc.asm A /trunk/build.vc9/assembler/mulredc.h M /trunk/build.vc9/config.h M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/ecm-params.h A /trunk/build.vc9/ecm-params.win32.amd.h A /trunk/build.vc9/ecm-params.win32.intel.h A /trunk/build.vc9/ecm-params.x64.amd.h A /trunk/build.vc9/ecm-params.x64.intel.h M /trunk/build.vc9/ecm.sln M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/readme.txt M /trunk/build.vc9/tests.py A /trunk/build.vc9/tune A /trunk/build.vc9/tune/tune.vcproj 1. Add tune to the windows GMP-ECM build 2. Add AMD and Intel build configurations (using tune output). 
------------------------------------------------------------------------ r1387 | kruppa | 2009-03-28 22:53:42 +0100 (Sat, 28 Mar 2009) | 2 lines Changed paths: M /trunk/INSTALL Updated note, since --enable-asm-redc is default now ------------------------------------------------------------------------ r1386 | brian_gladman | 2009-03-28 17:48:45 +0100 (Sat, 28 Mar 2009) | 1 line Changed paths: M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/tests.py M /trunk/ntt_gfp.c M /trunk/spv.c Convert SSE2 inline assembler for 32-bit Windows build ------------------------------------------------------------------------ r1385 | zimmerma | 2009-03-28 12:59:04 +0100 (Sat, 28 Mar 2009) | 2 lines Changed paths: M /trunk/README.dev [README.dev] more about INSTALL up-to-date ------------------------------------------------------------------------ r1384 | zimmerma | 2009-03-28 12:56:30 +0100 (Sat, 28 Mar 2009) | 2 lines Changed paths: M /trunk/INSTALL [INSTALL] latest release of GMP is 4.2.4 ------------------------------------------------------------------------ r1383 | kruppa | 2009-03-28 12:16:00 +0100 (Sat, 28 Mar 2009) | 2 lines Changed paths: M /trunk/INSTALL Added mention of --enable-asm-redc and --enable-sse2 ------------------------------------------------------------------------ r1382 | kruppa | 2009-03-28 11:56:24 +0100 (Sat, 28 Mar 2009) | 3 lines Changed paths: M /trunk/configure.in Enable SSE2 if config.guess identifies the system as "i786" which seems to be what recent autotools call a Pentium 4 (rather than pentium4 as before) ------------------------------------------------------------------------ r1381 | rcosset | 2009-03-27 10:24:54 +0100 (Fri, 27 Mar 2009) | 1 line Changed paths: M /trunk/hecm/auxi.h M /trunk/hecm/hecm.c M /trunk/hecm/morphismes.c Correction of typos in the comments of hecm ------------------------------------------------------------------------ r1380 | zimmerma | 2009-03-26 15:17:15 +0100 (Thu, 26 Mar 2009) | 4 lines Changed paths: M 
/trunk/test.ecm M /trunk/test.pm1 M /trunk/test.pp1 [test.pm1] added new test, updated copyright years, added comments for return code [test.pp1,test.ecm] updated copyright years ------------------------------------------------------------------------ r1379 | kruppa | 2009-03-25 13:42:39 +0100 (Wed, 25 Mar 2009) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in Removed the horrible FASTOBJ kludge. There must be a better way to specify the order in which object files are to appear on the link command. ------------------------------------------------------------------------ r1378 | kruppa | 2009-03-24 17:00:44 +0100 (Tue, 24 Mar 2009) | 3 lines Changed paths: M /trunk/mpmod.c Bugfix: in ecm_redc_n(), a carry was not propagated correctly if xp[n - 1] was zero but tp[n - 1] was non-zero. ------------------------------------------------------------------------ r1377 | rcosset | 2009-03-24 11:51:01 +0100 (Tue, 24 Mar 2009) | 1 line Changed paths: M /trunk/hecm/hecm.c Print the input number in hecm. ------------------------------------------------------------------------ r1376 | rcosset | 2009-03-24 11:40:39 +0100 (Tue, 24 Mar 2009) | 1 line Changed paths: A /trunk/hecm A /trunk/hecm/Jacobi.c A /trunk/hecm/Jacobi.h A /trunk/hecm/Makefile A /trunk/hecm/ariKS.c A /trunk/hecm/ariKS.h A /trunk/hecm/auxi.c A /trunk/hecm/auxi.h A /trunk/hecm/generation.c A /trunk/hecm/generation.h A /trunk/hecm/hecm.c A /trunk/hecm/hecm.h A /trunk/hecm/morphismes.c A /trunk/hecm/morphismes.h A /trunk/hecm/stage1HECM.c A /trunk/hecm/stage2HECM.c Added new software, gmp-hecm, based on gmp-ecm. HECM uses decomposable hyperelliptic curves of genus 2 instead of elliptic curves. Thus it does two runs of ECM in parallel. The use of Kummer surfaces with small parameters makes it quicker for numbers >= 10^300. 
------------------------------------------------------------------------ r1375 | kruppa | 2009-03-23 15:23:23 +0100 (Mon, 23 Mar 2009) | 2 lines Changed paths: M /trunk/NEWS M /trunk/configure.in Updated NEWS with 6.2.2 release, bumped version to 6.3 ------------------------------------------------------------------------ r1373 | kruppa | 2009-03-23 00:00:25 +0100 (Mon, 23 Mar 2009) | 3 lines Changed paths: M /trunk/ntt_gfp.c M /trunk/spv.c MacOS assembler doesn't like binary constants in asm code, replaced by hex. Patch supplied by "jedirock" on mersenneforum.org ------------------------------------------------------------------------ r1372 | rcosset | 2009-03-19 14:01:26 +0100 (Thu, 19 Mar 2009) | 1 line Changed paths: M /trunk/mpmod.c Added a few commentary in mpmod.c ------------------------------------------------------------------------ r1371 | zimmerma | 2009-03-18 17:42:46 +0100 (Wed, 18 Mar 2009) | 2 lines Changed paths: M /trunk/mpmod.c [mpmod.c] added some FIXME's ------------------------------------------------------------------------ r1370 | kruppa | 2009-03-18 01:13:21 +0100 (Wed, 18 Mar 2009) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c Merged in Romain's modifications ------------------------------------------------------------------------ r1369 | kruppa | 2009-03-18 00:20:59 +0100 (Wed, 18 Mar 2009) | 6 lines Changed paths: M /trunk/x86_64/Makefile.am M /trunk/x86_64/mulredc.h M /trunk/x86_64/mulredc.m4 D /trunk/x86_64/mulredc1.h M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc2.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M 
/trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm Removed mulredc1.h, prototypes are in mulredc.h now Removed superfluous #include in mulredc.h Some small changes in mulredc.m4: replace some movq by movl, change for-loop so it works with LENGTH=2 ------------------------------------------------------------------------ r1368 | kruppa | 2009-03-17 23:34:51 +0100 (Tue, 17 Mar 2009) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/athlon/Makefile.am M /trunk/pentium4/Makefile.am M /trunk/powerpc64/Makefile.am M /trunk/x86_64/Makefile.am The include path to mulredc.h must use $(srcdir) or a build with separate source/build directories (as used by make distcheck) fails ------------------------------------------------------------------------ r1367 | kruppa | 2009-03-17 22:16:56 +0100 (Tue, 17 Mar 2009) | 5 lines Changed paths: M /trunk/configure.in New config.guess don't call a Pentium 4 "pentium4" any more, but "i786", the corresponding case was missing in the first test for asm eligibility. And autoconf changes the m4 quote characters from ` and ' to [ and ], so they need to be quoted for shell character-set matching. I hate autotools ------------------------------------------------------------------------ r1366 | kruppa | 2009-03-17 20:54:47 +0100 (Tue, 17 Mar 2009) | 6 lines Changed paths: M /trunk/Makefile.am D /trunk/asmredc.h A /trunk/bench_mulredc.c M /trunk/configure.in M /trunk/mpmod.c A /trunk/test_mulredc.c Build mulredc code in subdirectories if enabled, and link to it from the top directory Moved bench_mulredc.c and test_mulredc.c to top directory rather than having identical copies in each subdir Did I mention that I hate autotools? 
------------------------------------------------------------------------ r1365 | kruppa | 2009-03-17 20:45:30 +0100 (Tue, 17 Mar 2009) | 4 lines Changed paths: D /trunk/config.guess D /trunk/config.sub Removed config.guess and config.sub, they aren't sources and developers with different versions of autotools installed will overwrite each others' config.guess/config.sub all the time ------------------------------------------------------------------------ r1364 | kruppa | 2009-03-17 20:41:07 +0100 (Tue, 17 Mar 2009) | 4 lines Changed paths: M /trunk/athlon/Makefile.am M /trunk/athlon/mulredc.h M /trunk/pentium4/Makefile.am M /trunk/powerpc64/Makefile.am D /trunk/powerpc64/bench.c A /trunk/powerpc64/mulredc.h D /trunk/powerpc64/test_mulredc.c Build mulredc as library in powerpc64 Removed extraneous prototypes, EXTRA_DIST entries ------------------------------------------------------------------------ r1363 | kruppa | 2009-03-17 20:34:31 +0100 (Tue, 17 Mar 2009) | 3 lines Changed paths: M /trunk/pentium4/Makefile.am D /trunk/pentium4/bench.c A /trunk/pentium4/mulredc.h D /trunk/pentium4/test_mulredc.c Build mulredc as library ------------------------------------------------------------------------ r1362 | kruppa | 2009-03-17 20:28:22 +0100 (Tue, 17 Mar 2009) | 2 lines Changed paths: M /trunk/athlon/Makefile.am D /trunk/athlon/bench.c A /trunk/athlon/mulredc.h D /trunk/athlon/test_mulredc.c Build mulredc as library ------------------------------------------------------------------------ r1361 | kruppa | 2009-03-17 20:08:17 +0100 (Tue, 17 Mar 2009) | 2 lines Changed paths: M /trunk/x86_64/Makefile.am Forgot to add mulredc.h to distribution ------------------------------------------------------------------------ r1360 | kruppa | 2009-03-17 19:55:46 +0100 (Tue, 17 Mar 2009) | 3 lines Changed paths: M /trunk/x86_64/Makefile.am D /trunk/x86_64/bench.c A /trunk/x86_64/mulredc.h D /trunk/x86_64/test_mulredc.c Build mulredc as library, so GMP-ECM can link it (with ld rather 
than ln) Removed bench and test_mulredc, are being moved to parent dir ------------------------------------------------------------------------ r1359 | kruppa | 2009-03-17 15:16:40 +0100 (Tue, 17 Mar 2009) | 2 lines Changed paths: D /trunk/x86_64/Makefile.dev The targets from Makefile.dev are now in Makefile.am ------------------------------------------------------------------------ r1358 | zimmerma | 2009-03-17 11:00:52 +0100 (Tue, 17 Mar 2009) | 6 lines Changed paths: A /trunk/m4 added empty directory m4, since autoreconf seems to require it: patate% autoreconf -i aclocal: couldn't open directory `m4': No such file or directory autoreconf: aclocal failed with exit status: 1 ------------------------------------------------------------------------ r1357 | kruppa | 2009-03-17 00:00:55 +0100 (Tue, 17 Mar 2009) | 3 lines Changed paths: M /trunk/x86_64/redc.asm Use RIP-relative addressing instead of horrible call/pop for computed jump. Fixes a compilation error on MacOS ------------------------------------------------------------------------ r1356 | kruppa | 2009-03-15 22:58:34 +0100 (Sun, 15 Mar 2009) | 2 lines Changed paths: M /trunk/configure.in Added i686-apple-darwin* to list of hosts that understand asm code ------------------------------------------------------------------------ r1355 | kruppa | 2009-03-12 16:02:33 +0100 (Thu, 12 Mar 2009) | 5 lines Changed paths: M /trunk/tune.c Fixed generation of NTT_GFP_TWIDDLE_DI[FT]_BREAKOVER values Avoid calling cputime() excessively often when timing short functions Fixed access to uninitialised memory ------------------------------------------------------------------------ r1354 | kruppa | 2009-03-12 15:53:28 +0100 (Thu, 12 Mar 2009) | 2 lines Changed paths: M /trunk/ecm-params.h.powerpc970 NTT_GFP_TWIDDLE_DI[FT]_BREAKOVER was not in log_2() form ------------------------------------------------------------------------ r1353 | kruppa | 2009-03-08 20:49:41 +0100 (Sun, 08 Mar 2009) | 3 lines Changed paths: M 
/trunk/Makefile.am M /trunk/configure.in M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.pentium3 M /trunk/ecm-params.h.pentium4 A /trunk/mul_fft-params.h.athlon64 A /trunk/mul_fft-params.h.default A /trunk/mul_fft-params.h.pentium3 A /trunk/mul_fft-params.h.pentium4 M /trunk/mul_fft.c Moved parameters for Schönhage-Strassen into separate file so that tune output does not overwrite them ------------------------------------------------------------------------ r1352 | zimmerma | 2009-03-04 14:45:34 +0100 (Wed, 04 Mar 2009) | 2 lines Changed paths: M /trunk/ecmfactor2.c [ecmfactor2.c] added wrapper function, and fixed example which did not work ------------------------------------------------------------------------ r1351 | zimmerma | 2009-03-03 15:32:13 +0100 (Tue, 03 Mar 2009) | 2 lines Changed paths: M /trunk/main.c [main.c] update champions sizes ------------------------------------------------------------------------ r1350 | zimmerma | 2009-02-27 17:34:43 +0100 (Fri, 27 Feb 2009) | 2 lines Changed paths: M /trunk/NEWS M /trunk/configure.in M /trunk/powerpc64/powerpc-defs.m4 the powerpc64 assembly code from Philip McLaughlin now works on Linux too ------------------------------------------------------------------------ r1349 | zimmerma | 2009-02-27 16:56:18 +0100 (Fri, 27 Feb 2009) | 2 lines Changed paths: M /trunk/acinclude.m4 [acinclude.m4] too many quotes ------------------------------------------------------------------------ r1348 | zimmerma | 2009-02-27 16:45:36 +0100 (Fri, 27 Feb 2009) | 3 lines Changed paths: M /trunk/acinclude.m4 M /trunk/configure.in M /trunk/powerpc64/Makefile.am M /trunk/powerpc64/Makefile.dev M /trunk/powerpc64/mulredc.m4 M /trunk/powerpc64/mulredc1.asm M /trunk/powerpc64/mulredc10.asm M /trunk/powerpc64/mulredc11.asm M /trunk/powerpc64/mulredc12.asm M /trunk/powerpc64/mulredc13.asm M /trunk/powerpc64/mulredc14.asm M /trunk/powerpc64/mulredc15.asm M /trunk/powerpc64/mulredc16.asm M /trunk/powerpc64/mulredc17.asm M 
/trunk/powerpc64/mulredc18.asm M /trunk/powerpc64/mulredc19.asm M /trunk/powerpc64/mulredc2.asm M /trunk/powerpc64/mulredc20.asm M /trunk/powerpc64/mulredc3.asm M /trunk/powerpc64/mulredc4.asm M /trunk/powerpc64/mulredc5.asm M /trunk/powerpc64/mulredc6.asm M /trunk/powerpc64/mulredc7.asm M /trunk/powerpc64/mulredc8.asm M /trunk/powerpc64/mulredc9.asm M /trunk/powerpc64/mulredc_1_2.m4 A /trunk/powerpc64/powerpc-defs.m4 M /trunk/powerpc64/redc.asm adapt PowerPC assembly files so that they can be used under Linux too (does not yet work) ------------------------------------------------------------------------ r1347 | kruppa | 2009-02-25 14:51:06 +0100 (Wed, 25 Feb 2009) | 2 lines Changed paths: M /trunk/pp1.c Replaced some mpres_mul_ui() by 2 with mpres_add() ------------------------------------------------------------------------ r1346 | kruppa | 2009-02-24 23:18:20 +0100 (Tue, 24 Feb 2009) | 2 lines Changed paths: M /trunk/x86_64/bench.c Fixed stupid error message about label at end of compound statement ------------------------------------------------------------------------ r1345 | kruppa | 2009-02-24 23:16:05 +0100 (Tue, 24 Feb 2009) | 5 lines Changed paths: M /trunk/x86_64/redc.asm If a redc.s file was generated somehow, it was not run through the C preprocessor before assembly (only .S files are), causing address generation for a computed jump to go wrong. Added a tripwire so that assembly fails if preprocessor isn't used. 
------------------------------------------------------------------------ r1344 | kruppa | 2009-02-24 19:49:27 +0100 (Tue, 24 Feb 2009) | 2 lines Changed paths: M /trunk/x86_64/Makefile.am M /trunk/x86_64/bench.c A /trunk/x86_64/mulredc1.h A /trunk/x86_64/mulredc1.m4 M /trunk/x86_64/test_mulredc.c For Romain: mulredc code for n x 1 products ------------------------------------------------------------------------ r1343 | kruppa | 2009-02-21 22:17:27 +0100 (Sat, 21 Feb 2009) | 2 lines Changed paths: D /trunk/powerpc64/Makefile.in Makefile.in should not be in repository, only Makefile.am ------------------------------------------------------------------------ r1342 | zimmerma | 2009-02-18 23:06:52 +0100 (Wed, 18 Feb 2009) | 3 lines Changed paths: M /trunk/tune.c [tune.c] NTT_GFP_TWIDDLE_DIF_BREAKOVER/NTT_GFP_TWIDDLE_DIT_BREAKOVER should be the logarithm in base 2 of the corresponding thresholds ------------------------------------------------------------------------ r1341 | kruppa | 2009-02-16 17:03:05 +0100 (Mon, 16 Feb 2009) | 4 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1fs2.c M /trunk/sets_long.c Moved maxS() from pm1fs2.c to sets_long.c as sets_max(), as it depends on how sets_long.c picks sets. Included test of sets_max() in the self-test. Fixed some typos in comments. 
------------------------------------------------------------------------ r1340 | kruppa | 2009-02-15 16:09:34 +0100 (Sun, 15 Feb 2009) | 2 lines Changed paths: D /trunk/m4 Delete empty directory ------------------------------------------------------------------------ r1339 | zimmerma | 2009-02-12 10:58:27 +0100 (Thu, 12 Feb 2009) | 2 lines Changed paths: M /trunk/Makefile.am [Makefile.am] forgot ecm-params.h.powerpc970 in make dist ------------------------------------------------------------------------ r1338 | kruppa | 2009-02-11 23:20:32 +0100 (Wed, 11 Feb 2009) | 4 lines Changed paths: M /trunk/x86_64/bench.c M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm Added Phil McLaughlin's suggestion to remove a useless carry propagation. Made bench.c use getrusage() if available which has better resolution than clock() or times(). 
------------------------------------------------------------------------ r1337 | kruppa | 2009-02-11 15:26:46 +0100 (Wed, 11 Feb 2009) | 2 lines Changed paths: M /trunk/x86_64/Makefile.am Add targets for bench and test_mulredc ------------------------------------------------------------------------ r1336 | kruppa | 2009-02-11 15:18:50 +0100 (Wed, 11 Feb 2009) | 2 lines Changed paths: M /trunk/x86_64/bench.c M /trunk/x86_64/test_mulredc.c Made bench compile again, fixed some -pedantic warnings in test_mulredc.c ------------------------------------------------------------------------ r1335 | zimmerma | 2009-02-11 10:00:35 +0100 (Wed, 11 Feb 2009) | 2 lines Changed paths: M /trunk/powerpc64/README [powerpc64/README] added reference to LGPL license ------------------------------------------------------------------------ r1334 | zimmerma | 2009-02-10 09:58:20 +0100 (Tue, 10 Feb 2009) | 2 lines Changed paths: A /trunk/ecm-params.h.powerpc970 [ecm-params.h.powerpc970] default tuning parameters for powerpc64 ------------------------------------------------------------------------ r1333 | zimmerma | 2009-02-10 09:50:43 +0100 (Tue, 10 Feb 2009) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in A /trunk/powerpc64 A /trunk/powerpc64/Makefile.am A /trunk/powerpc64/Makefile.dev A /trunk/powerpc64/Makefile.in A /trunk/powerpc64/README A /trunk/powerpc64/bench.c A /trunk/powerpc64/generate_all A /trunk/powerpc64/mulredc.m4 A /trunk/powerpc64/mulredc1.asm A /trunk/powerpc64/mulredc10.asm A /trunk/powerpc64/mulredc11.asm A /trunk/powerpc64/mulredc12.asm A /trunk/powerpc64/mulredc13.asm A /trunk/powerpc64/mulredc14.asm A /trunk/powerpc64/mulredc15.asm A /trunk/powerpc64/mulredc16.asm A /trunk/powerpc64/mulredc17.asm A /trunk/powerpc64/mulredc18.asm A /trunk/powerpc64/mulredc19.asm A /trunk/powerpc64/mulredc2.asm A /trunk/powerpc64/mulredc20.asm A /trunk/powerpc64/mulredc3.asm A /trunk/powerpc64/mulredc4.asm A /trunk/powerpc64/mulredc5.asm A 
/trunk/powerpc64/mulredc6.asm A /trunk/powerpc64/mulredc7.asm A /trunk/powerpc64/mulredc8.asm A /trunk/powerpc64/mulredc9.asm A /trunk/powerpc64/mulredc_1_2.m4 A /trunk/powerpc64/redc.asm A /trunk/powerpc64/test_mulredc.c incorporated asm redc code for powerpc64 from Philip McLaughlin (still to be tested) ------------------------------------------------------------------------ r1332 | kruppa | 2009-01-18 16:35:23 +0100 (Sun, 18 Jan 2009) | 2 lines Changed paths: M /trunk/x86_64/mulredc.m4 Replaced xorq by xorl, added an assert. Cosmetic change, mostly ------------------------------------------------------------------------ r1331 | kruppa | 2009-01-18 16:33:38 +0100 (Sun, 18 Jan 2009) | 2 lines Changed paths: M /trunk/techdocs/mulrecip.tex Small fixes, cleanups ------------------------------------------------------------------------ r1330 | kruppa | 2009-01-18 16:32:10 +0100 (Sun, 18 Jan 2009) | 2 lines Changed paths: M /trunk/rho.gp Small cleanups in comments ------------------------------------------------------------------------ r1329 | kruppa | 2009-01-16 15:35:39 +0100 (Fri, 16 Jan 2009) | 3 lines Changed paths: M /trunk/mpzspm.c Use outputf() instead of printf() for error messages. Print some timing in mpzspm_init() with DEVVERBOSE. ------------------------------------------------------------------------ r1328 | kruppa | 2009-01-16 15:02:57 +0100 (Fri, 16 Jan 2009) | 3 lines Changed paths: M /trunk/rho.c pmeprob() should not access "go" if it might be a NULL pointer. Added function for P-1 probability for factors in a known residue class. ------------------------------------------------------------------------ r1327 | zimmerma | 2009-01-07 12:41:32 +0100 (Wed, 07 Jan 2009) | 5 lines Changed paths: M /trunk/ecm.c M /trunk/ecm.h M /trunk/factor.c M /trunk/main.c Added patch from Philip McLaughlin which adds new option -nobase2s2 that disables base-2 arithmetic in Step 2. 
This is experimental, and might be removed or replaced by another mechanism later on; in particular, it seems it breaks the binary compatibility. ------------------------------------------------------------------------ r1326 | zimmerma | 2009-01-07 11:28:03 +0100 (Wed, 07 Jan 2009) | 5 lines Changed paths: M /trunk/ecm.c M /trunk/mpmod.c [mpmod.c] added code to compare base-2 arithmetic to default one (disabled for now) [ecm.c] isbase2() was called twice -> called only once now, should yield a small speedup, especially for small B1 ------------------------------------------------------------------------ r1325 | zimmerma | 2008-12-19 16:01:05 +0100 (Fri, 19 Dec 2008) | 2 lines Changed paths: D /trunk/m4/libtool.m4 D /trunk/m4/ltoptions.m4 D /trunk/m4/ltsugar.m4 D /trunk/m4/ltversion.m4 D /trunk/m4/lt~obsolete.m4 removed m4/* files ------------------------------------------------------------------------ r1324 | zimmerma | 2008-12-14 15:05:29 +0100 (Sun, 14 Dec 2008) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h A /trunk/ecmfactor2.c M /trunk/main.c M /trunk/pm1.c M /trunk/pp1.c (unfinished) change to enable the use of GMP-ECM stage 2 from HECM, where a curve is given in Weierstrass form (see example in ecmfactor2.c). It compiles, but remains to be debugged... 
------------------------------------------------------------------------ r1323 | zimmerma | 2008-12-14 13:45:34 +0100 (Sun, 14 Dec 2008) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in A /trunk/m4 A /trunk/m4/libtool.m4 A /trunk/m4/ltoptions.m4 A /trunk/m4/ltsugar.m4 A /trunk/m4/ltversion.m4 A /trunk/m4/lt~obsolete.m4 [configure.in,Makefile.am] switch to automake >= 1.10, and added macros suggested by autoreconf -i ------------------------------------------------------------------------ r1322 | brian_gladman | 2008-11-24 10:46:01 +0100 (Mon, 24 Nov 2008) | 1 line Changed paths: M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/readme.txt Update VC++ build project to assume that the GMP root directory is named 'GMP' not 'GMP-version' ------------------------------------------------------------------------ r1321 | brian_gladman | 2008-11-24 10:21:23 +0100 (Mon, 24 Nov 2008) | 1 line Changed paths: M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/libecm/libecm.vcproj Update MSVC builds to use GMP-4.2.4 ------------------------------------------------------------------------ r1320 | zimmerma | 2008-10-12 14:13:23 +0200 (Sun, 12 Oct 2008) | 2 lines Changed paths: M /trunk/configure.in [configure.in] changed version to 6.2.2 ------------------------------------------------------------------------ r1319 | brian_gladman | 2008-09-02 18:15:48 +0200 (Tue, 02 Sep 2008) | 1 line Changed paths: M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/libecm/libecm.vcproj correction to VC++ build project to set the GMP include directory to gmp-4.2.3 ------------------------------------------------------------------------ r1318 | brian_gladman | 2008-08-30 22:03:27 +0200 (Sat, 30 Aug 2008) | 2 lines Changed paths: M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/ecm-params.h M /trunk/build.vc9/readme.txt Update the Visual Studio build to use GMP-4.2.3 
------------------------------------------------------------------------ r1317 | zimmerma | 2008-07-17 11:28:30 +0200 (Thu, 17 Jul 2008) | 2 lines Changed paths: M /trunk/configure.in fixed split infinitive (thanks Paul Leyland) ------------------------------------------------------------------------ r1316 | zimmerma | 2008-06-13 20:50:28 +0200 (Fri, 13 Jun 2008) | 2 lines Changed paths: M /trunk/AUTHORS updated Dave's address ------------------------------------------------------------------------ r1315 | zimmerma | 2008-06-13 02:17:41 +0200 (Fri, 13 Jun 2008) | 2 lines Changed paths: M /trunk/TODO added new item ------------------------------------------------------------------------ r1314 | zimmerma | 2008-06-12 23:53:56 +0200 (Thu, 12 Jun 2008) | 2 lines Changed paths: M /trunk/TODO added comment ------------------------------------------------------------------------ r1313 | zimmerma | 2008-06-12 23:26:27 +0200 (Thu, 12 Jun 2008) | 2 lines Changed paths: M /trunk/listz.c compile list_mul_low only if KS_MULTIPLY is not defined, to avoid a warning ------------------------------------------------------------------------ r1312 | kruppa | 2008-06-12 02:38:03 +0200 (Thu, 12 Jun 2008) | 2 lines Changed paths: M /trunk/pm1.c Bugfix: new P-1 stage 2 called pm1prob() with uninitialised value for S ------------------------------------------------------------------------ r1311 | zimmerma | 2008-06-09 09:56:58 +0200 (Mon, 09 Jun 2008) | 2 lines Changed paths: M /trunk/configure.in removed --with-gmp-build option (no longer needed as we don't need gmp-impl.h) ------------------------------------------------------------------------ r1310 | zimmerma | 2008-06-06 05:49:50 +0200 (Fri, 06 Jun 2008) | 2 lines Changed paths: M /trunk/TODO added two items ------------------------------------------------------------------------ r1309 | zimmerma | 2008-06-05 03:50:16 +0200 (Thu, 05 Jun 2008) | 3 lines Changed paths: M /trunk/README.dev M /trunk/mul_fft.c added tags corresponding to 
6.2 and 6.2.1 in README.dev commented out unused function in mul_fft.c ------------------------------------------------------------------------ r1307 | kruppa | 2008-06-04 05:43:25 +0200 (Wed, 04 Jun 2008) | 2 lines Changed paths: M /trunk/ChangeLog M /trunk/NEWS M /trunk/build.vc9/config.h Set version to 6.2.1 in trunk/build.vc9/config.h, updated NEWS, ChangeLog ------------------------------------------------------------------------ r1306 | kruppa | 2008-06-04 01:53:32 +0200 (Wed, 04 Jun 2008) | 2 lines Changed paths: M /trunk/countsmooth.c Make countsmooth compile again ------------------------------------------------------------------------ r1305 | brian_gladman | 2008-05-30 17:11:07 +0200 (Fri, 30 May 2008) | 1 line Changed paths: M /trunk/build.vc9/config.h M /trunk/build.vc9/ecm/ecm.vcproj M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/tests.py Revert Windows build to use GMP-4.2.1 ------------------------------------------------------------------------ r1304 | kruppa | 2008-05-28 16:56:05 +0200 (Wed, 28 May 2008) | 2 lines Changed paths: M /trunk/pm1.c M /trunk/rho.c Print message about -go with P-1 probabilities ------------------------------------------------------------------------ r1303 | kruppa | 2008-05-28 16:33:34 +0200 (Wed, 28 May 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Print stage 2 time if factor is found ------------------------------------------------------------------------ r1302 | kruppa | 2008-05-28 16:09:12 +0200 (Wed, 28 May 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/rho.c Print probability of finding factors for P-1 ------------------------------------------------------------------------ r1301 | zimmerma | 2008-05-28 10:56:30 +0200 (Wed, 28 May 2008) | 4 lines Changed paths: M /trunk/main.c added warning for -go n1 -go n2 -> only n2 is taken into account (we could modify the code to take both into account, but we also can write -go "n1*n2") 
------------------------------------------------------------------------ r1300 | zimmerma | 2008-05-28 09:20:57 +0200 (Wed, 28 May 2008) | 2 lines Changed paths: M /trunk/README.dev M /trunk/configure.in changed the version to 6.2.1, and added hints in README.dev ------------------------------------------------------------------------ r1298 | zimmerma | 2008-05-27 23:04:26 +0200 (Tue, 27 May 2008) | 2 lines Changed paths: M /trunk/sp.h fixed compilation problem on IA65, EV56, ARM ------------------------------------------------------------------------ r1297 | kruppa | 2008-05-27 17:52:50 +0200 (Tue, 27 May 2008) | 2 lines Changed paths: M /trunk/ecm.c M /trunk/main.c M /trunk/resume.c Print success probabilities only if B1 == B2min ------------------------------------------------------------------------ r1296 | kruppa | 2008-05-27 17:09:21 +0200 (Tue, 27 May 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pp1.c Increased default B2 for new P-/+1 stage 2 ------------------------------------------------------------------------ r1295 | jasonp | 2008-05-26 18:33:19 +0200 (Mon, 26 May 2008) | 1 line Changed paths: M /trunk/tune.c fix to previous commit ------------------------------------------------------------------------ r1294 | jasonp | 2008-05-26 17:46:51 +0200 (Mon, 26 May 2008) | 1 line Changed paths: M /trunk/ecm-params.h.alpha-ev5 M /trunk/ecm-params.h.alpha-ev6 M /trunk/ecm-params.h.athlon M /trunk/ecm-params.h.athlon64 M /trunk/ecm-params.h.core2 M /trunk/ecm-params.h.default M /trunk/ecm-params.h.pentium-m M /trunk/ecm-params.h.pentium3 M /trunk/ecm-params.h.pentium4 M /trunk/ecm-params.h.powerpc7450 M /trunk/ntt_gfp.c M /trunk/sp.h M /trunk/spm.c M /trunk/tune.c allow tuning of the breakover point between recursive and iterative NTTs ------------------------------------------------------------------------ r1293 | kruppa | 2008-05-25 15:22:55 +0200 (Sun, 25 May 2008) | 3 lines Changed paths: A /trunk/rho.gp Pari/GP file for 
estimating ECM probability of success, GMP-ECM's rho.c is a port of this file. ------------------------------------------------------------------------ r1292 | brian_gladman | 2008-05-16 18:22:41 +0200 (Fri, 16 May 2008) | 1 line Changed paths: M /trunk/build.vc9/config.h set version to 6.2 for MSVC ------------------------------------------------------------------------ r1291 | zimmerma | 2008-05-16 17:39:54 +0200 (Fri, 16 May 2008) | 2 lines Changed paths: M /trunk/mpzspv.c removed useless comment (and comment on comment) ------------------------------------------------------------------------ r1290 | kruppa | 2008-05-16 16:43:57 +0200 (Fri, 16 May 2008) | 3 lines Changed paths: M /trunk/Makefile.am A /trunk/ecm-params.h.pentium-m D /trunk/ecm-params.h.pentiumm Renamed parameter file from pentiumm to pentium-m, as that is what GMP uses for the architecture name. gcc uses parameter "-march pentium-m", too ------------------------------------------------------------------------ r1289 | kruppa | 2008-05-16 14:41:48 +0200 (Fri, 16 May 2008) | 2 lines Changed paths: M /trunk/ChangeLog Added latest changes ------------------------------------------------------------------------ r1288 | kruppa | 2008-05-16 14:38:03 +0200 (Fri, 16 May 2008) | 2 lines Changed paths: M /trunk/configure.in Set version to 6.2, set assertions to off by default ------------------------------------------------------------------------ r1287 | kruppa | 2008-05-16 14:27:26 +0200 (Fri, 16 May 2008) | 2 lines Changed paths: M /trunk/NEWS Added item: bugfix of Lucas chains for primes close to 3^32 ------------------------------------------------------------------------ r1286 | kruppa | 2008-05-16 14:14:06 +0200 (Fri, 16 May 2008) | 2 lines Changed paths: M /trunk/TODO Extended note telling why B2min 30 limbs ------------------------------------------------------------------------ r1277 | brian_gladman | 2008-05-13 21:24:37 +0200 (Tue, 13 May 2008) | 1 line Changed paths: M /trunk/build.vc9/tests.py 
------------------------------------------------------------------------ r1276 | kruppa | 2008-05-12 01:16:36 +0200 (Mon, 12 May 2008) | 2 lines Changed paths: M /trunk/ecm.c M /trunk/lucas.c Fixed integer overflows in PRAC ------------------------------------------------------------------------ r1275 | kruppa | 2008-05-12 01:15:36 +0200 (Mon, 12 May 2008) | 3 lines Changed paths: M /trunk/makesmooth.gp Make it work better with larger stage 1 primes by using nextprime() instead of forprime() ------------------------------------------------------------------------ r1274 | kruppa | 2008-05-11 21:42:05 +0200 (Sun, 11 May 2008) | 2 lines Changed paths: M /trunk/test.pp1 Added test case for bug in PRAC code for P+1 ------------------------------------------------------------------------ r1273 | kruppa | 2008-05-09 18:12:22 +0200 (Fri, 09 May 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Parameter selection adjusted for P+1 which prefers smaller s_2 ------------------------------------------------------------------------ r1272 | brian_gladman | 2008-05-06 15:20:29 +0200 (Tue, 06 May 2008) | 1 line Changed paths: M /trunk/sp.h improved MSVC assembler code ------------------------------------------------------------------------ r1271 | zimmerma | 2008-05-06 14:31:31 +0200 (Tue, 06 May 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c replaced s_2 = ... by k = s_2 = ... 
(continued) ------------------------------------------------------------------------ r1269 | kruppa | 2008-05-06 14:19:27 +0200 (Tue, 06 May 2008) | 2 lines Changed paths: M /trunk/ChangeLog Added latest changes ------------------------------------------------------------------------ r1268 | kruppa | 2008-05-06 14:17:43 +0200 (Tue, 06 May 2008) | 2 lines Changed paths: M /trunk/pm1.c M /trunk/pp1.c Added "k = s_2 =" to -v output for new stage 2 ------------------------------------------------------------------------ r1266 | kruppa | 2008-05-06 13:59:54 +0200 (Tue, 06 May 2008) | 2 lines Changed paths: M /trunk/ChangeLog Added latest changes ------------------------------------------------------------------------ r1265 | kruppa | 2008-05-06 13:58:23 +0200 (Tue, 06 May 2008) | 2 lines Changed paths: M /trunk/README.dev M /trunk/configure.in Switched assertions to on by default (change to off for official release) ------------------------------------------------------------------------ r1264 | kruppa | 2008-05-06 13:45:04 +0200 (Tue, 06 May 2008) | 2 lines Changed paths: M /trunk/README Added remark explaining that s_2 in new stage 2 is similar to k in old one ------------------------------------------------------------------------ r1263 | kruppa | 2008-05-05 19:20:59 +0200 (Mon, 05 May 2008) | 4 lines Changed paths: M /trunk/TODO Added Torbjorn Granlund's suggestion for faster mpn_mod_1() Added item on rewriting mpmod.c to use mpn_*, not mpz_* (long term goal) Removed item on dynamic library, mostly done ------------------------------------------------------------------------ r1262 | kruppa | 2008-05-05 18:28:41 +0200 (Mon, 05 May 2008) | 2 lines Changed paths: M /trunk/README Added remark about much improved performance in 64 bit mode ------------------------------------------------------------------------ r1261 | kruppa | 2008-05-05 18:10:05 +0200 (Mon, 05 May 2008) | 3 lines Changed paths: M /trunk/sp.h Added sp_add() C code for modulus with MSB=0, fixed 
comments for sp_add() asm macro ------------------------------------------------------------------------ r1260 | kruppa | 2008-05-05 17:26:30 +0200 (Mon, 05 May 2008) | 3 lines Changed paths: M /trunk/pm1fs2.c Mostly rewrote parameter selection to minimize estimated cost, allow smaller increments of B2 ------------------------------------------------------------------------ r1259 | zimmerma | 2008-05-03 00:19:56 +0200 (Sat, 03 May 2008) | 2 lines Changed paths: M /trunk/README.dev we should look in TODO for a new release ------------------------------------------------------------------------ r1258 | zimmerma | 2008-05-03 00:18:44 +0200 (Sat, 03 May 2008) | 2 lines Changed paths: M /trunk/TODO added bug with GWNUM interface ------------------------------------------------------------------------ r1257 | kruppa | 2008-05-02 23:52:10 +0200 (Fri, 02 May 2008) | 7 lines Changed paths: M /trunk/configure.in M /trunk/ecm-gmp.h M /trunk/ecm-impl.h M /trunk/ks-multiply.c M /trunk/mpmod.c M /trunk/mpzspv.c M /trunk/mul_fft.c M /trunk/pm1fs2.c M /trunk/schoen_strass.c Fixed broken help string for --enable-sse2 in configure Mangled names of mpn_fft_best_k() and mpn_fft_next_size(), moved prototypes of mpn_fft_*() functions to ecm-impl.h Declared some functions static that were used only locally and polluted the namespace ------------------------------------------------------------------------ r1256 | brian_gladman | 2008-05-02 21:42:15 +0200 (Fri, 02 May 2008) | 3 lines Changed paths: M /trunk/build.vc9/config.h M /trunk/build.vc9/ecm-params.h M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/longlong.h add MSVC intrinsics to longlong.h update build files ------------------------------------------------------------------------ r1255 | brian_gladman | 2008-05-01 23:52:03 +0200 (Thu, 01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/tests.py ------------------------------------------------------------------------ r1254 | brian_gladman | 2008-05-01 23:45:32 +0200 (Thu, 
01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/config.h M /trunk/build.vc9/tests.py update version (noticed by Paul) ------------------------------------------------------------------------ r1253 | brian_gladman | 2008-05-01 21:43:33 +0200 (Thu, 01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/readme.txt ------------------------------------------------------------------------ r1252 | zimmerma | 2008-05-01 21:41:43 +0200 (Thu, 01 May 2008) | 3 lines Changed paths: M /trunk/INSTALL M /trunk/configure.in INSTALL: added pointer to build.vc9/readme.txt for Windows/VC++ configure.in: changed version to 6.2-rc2 ------------------------------------------------------------------------ r1251 | brian_gladman | 2008-05-01 21:36:11 +0200 (Thu, 01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/readme.txt further update to the VC++ readme file. ------------------------------------------------------------------------ r1250 | brian_gladman | 2008-05-01 20:42:25 +0200 (Thu, 01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/config.h M /trunk/build.vc9/readme.txt minor non critical changes to readme.txt and config.h ------------------------------------------------------------------------ r1249 | brian_gladman | 2008-05-01 19:20:31 +0200 (Thu, 01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/tests.py ------------------------------------------------------------------------ r1248 | brian_gladman | 2008-05-01 11:38:19 +0200 (Thu, 01 May 2008) | 1 line Changed paths: M /trunk/build.vc9/readme.txt M /trunk/build.vc9/tests.py ------------------------------------------------------------------------ r1247 | zimmerma | 2008-05-01 01:05:46 +0200 (Thu, 01 May 2008) | 2 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in changes to have Brian's VC build files included in "make dist" ------------------------------------------------------------------------ r1246 | zimmerma | 2008-05-01 01:04:51 +0200 (Thu, 01 May 2008) | 2 lines Changed paths: A 
/trunk/build.vc9/Makefile.am A /trunk/build.vc9/assembler/Makefile.am A /trunk/build.vc9/ecm/Makefile.am A /trunk/build.vc9/libecm/Makefile.am needed makefiles to have the VC build files included in "make dist" ------------------------------------------------------------------------ r1245 | kruppa | 2008-04-30 17:34:22 +0200 (Wed, 30 Apr 2008) | 2 lines Changed paths: M /trunk/configure.in Use -W compiler flag instead of -Wextra so it works with older gcc version ------------------------------------------------------------------------ r1241 | kruppa | 2008-04-29 19:11:41 +0200 (Tue, 29 Apr 2008) | 2 lines Changed paths: M /trunk/ChangeLog Added the most recent changes ------------------------------------------------------------------------ r1240 | kruppa | 2008-04-29 16:16:58 +0200 (Tue, 29 Apr 2008) | 2 lines Changed paths: M /trunk/mul_fft.c More compiler warnings fixed... ------------------------------------------------------------------------ r1239 | zimmerma | 2008-04-29 16:06:38 +0200 (Tue, 29 Apr 2008) | 2 lines Changed paths: M /trunk/NEWS M /trunk/ecm.h put ecm.h under LGPL (as it should have been from the beginning...) ------------------------------------------------------------------------ r1238 | kruppa | 2008-04-29 15:36:30 +0200 (Tue, 29 Apr 2008) | 2 lines Changed paths: M /trunk/auxlib.c M /trunk/longlong.h M /trunk/mul_fft.c Fixed some more compiler warnings ------------------------------------------------------------------------ r1237 | kruppa | 2008-04-29 15:14:20 +0200 (Tue, 29 Apr 2008) | 4 lines Changed paths: M /trunk/configure.in Probing for compiler warning flags was unreliable: with Sun CC, -pedantic succeeds in the test, but fails during compiling. 
Adding warning flags only if we use GCC now ------------------------------------------------------------------------ r1236 | kruppa | 2008-04-29 14:01:38 +0200 (Tue, 29 Apr 2008) | 7 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm_ntt.c M /trunk/eval.c M /trunk/main.c M /trunk/pm1fs2.c M /trunk/polyeval.c M /trunk/pp1.c M /trunk/random.c M /trunk/schoen_strass.c M /trunk/sp.h M /trunk/stage2.c Changed some "#if HAVE_*" to "#ifdef HAVE_*" to avoid warnings with -Wundef This implies that "#define HAVE_FOO 0" makes the ifdef succeed which is counter-intuitive. A test that can properly distinguish macros that are undefined, defined to empty token, defined to 0 or defined to 1 seems to require token concatenation and two-level expansion which is horrible. ------------------------------------------------------------------------ r1235 | kruppa | 2008-04-28 20:38:54 +0200 (Mon, 28 Apr 2008) | 2 lines Changed paths: M /trunk/ecm.1 Man page was out of date, re-made from ecm.xml ------------------------------------------------------------------------ r1234 | kruppa | 2008-04-28 20:36:32 +0200 (Mon, 28 Apr 2008) | 5 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in Fixed bug, test for matching version in gmp.h and libgmp failed due to missing include path if there was no gmp.h in a default include directory. Significant rewrite that hopefully is both more correct and cleaner. Added tests for compiler warning flags. 
------------------------------------------------------------------------ r1232 | kruppa | 2008-04-28 16:58:54 +0200 (Mon, 28 Apr 2008) | 2 lines Changed paths: M /trunk/ChangeLog Added recent changes to ChangeLog ------------------------------------------------------------------------ r1231 | kruppa | 2008-04-28 16:38:58 +0200 (Mon, 28 Apr 2008) | 2 lines Changed paths: M /trunk/INSTALL Some updates for 6.2rc1 ------------------------------------------------------------------------ r1230 | kruppa | 2008-04-28 16:24:21 +0200 (Mon, 28 Apr 2008) | 3 lines Changed paths: M /trunk/Makefile.am Added .s and .S files to CLEANFILES, as they did not get cleaned up by default ------------------------------------------------------------------------ r1229 | kruppa | 2008-04-28 12:14:50 +0200 (Mon, 28 Apr 2008) | 2 lines Changed paths: M /trunk/test.ecm M /trunk/test.pm1 M /trunk/test.pp1 Added a test in each file with an input too large for mulredc*() ------------------------------------------------------------------------ r1228 | kruppa | 2008-04-28 01:05:06 +0200 (Mon, 28 Apr 2008) | 3 lines Changed paths: M /trunk/configure.in Fixed problem with ./configure script not running correctly if --with-gmp was given and --enable-shared was not. 
------------------------------------------------------------------------ r1227 | zimmerma | 2008-04-27 13:28:14 +0200 (Sun, 27 Apr 2008) | 2 lines Changed paths: M /trunk/configure.in fixed warning "AC_CANONICAL_HOST invoked multiple times" ------------------------------------------------------------------------ r1226 | zimmerma | 2008-04-27 10:30:00 +0200 (Sun, 27 Apr 2008) | 2 lines Changed paths: M /trunk/README.dev we also need to check INSTALL for a new release ------------------------------------------------------------------------ r1225 | zimmerma | 2008-04-27 10:28:35 +0200 (Sun, 27 Apr 2008) | 2 lines Changed paths: M /trunk/INSTALL update GMP latest version ------------------------------------------------------------------------ r1224 | kruppa | 2008-04-25 17:55:01 +0200 (Fri, 25 Apr 2008) | 2 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in Trying to make --enable-shared and --with-gmp work together ------------------------------------------------------------------------ r1223 | kruppa | 2008-04-25 14:59:34 +0200 (Fri, 25 Apr 2008) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in M /trunk/x86_64/redc.asm Fixed problem with compiling x86_64 assembly routines for dynamic library, a computed jump referenced absolute address of label in .text. 
Now we compute target address relative to rip ------------------------------------------------------------------------ r1222 | kruppa | 2008-04-25 11:44:59 +0200 (Fri, 25 Apr 2008) | 3 lines Changed paths: M /trunk/pm1.c M /trunk/pp1.c Fixed implicit conversion of int constant to double which caused warning in Visual C ------------------------------------------------------------------------ r1221 | kruppa | 2008-04-25 11:32:24 +0200 (Fri, 25 Apr 2008) | 3 lines Changed paths: M /trunk/makesmooth.gp Added function to produce primes where a given value is a quadratic non-residue, to make testing P+1 easier ------------------------------------------------------------------------ r1220 | kruppa | 2008-04-24 17:57:18 +0200 (Thu, 24 Apr 2008) | 2 lines Changed paths: M /trunk/acinclude.m4 A missing "-c" flag in test compilation of .s files caused configure to fail ------------------------------------------------------------------------ r1219 | lfousse | 2008-04-24 15:37:07 +0200 (Thu, 24 Apr 2008) | 2 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in Produce a shared library with libtool. 
------------------------------------------------------------------------ r1218 | brian_gladman | 2008-04-20 17:09:00 +0200 (Sun, 20 Apr 2008) | 1 line Changed paths: A /trunk/build.vc9/assembler/a_win32a_mulredc.asm A /trunk/build.vc9/assembler/a_win32a_redc.asm M /trunk/build.vc9/assembler/a_win32p_mulredc.asm M /trunk/build.vc9/assembler/a_win32p_redc.asm M /trunk/build.vc9/assembler/a_x64_mulredc.asm M /trunk/build.vc9/libecm/libecm.vcproj add win32 athlon assembler support for Visual C build ------------------------------------------------------------------------ r1217 | brian_gladman | 2008-04-18 12:33:20 +0200 (Fri, 18 Apr 2008) | 1 line Changed paths: A /trunk/build.vc9/assembler/a_win32p_mulredc.asm A /trunk/build.vc9/assembler/a_win32p_redc.asm M /trunk/build.vc9/assembler/a_x64_mulredc.asm M /trunk/build.vc9/config.h M /trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/tests.py Add 32-bit pentium assembler support for VC++ build ------------------------------------------------------------------------ r1216 | kruppa | 2008-04-17 15:25:51 +0200 (Thu, 17 Apr 2008) | 2 lines Changed paths: M /trunk/README Updated to NTT and SchönhageStrassen sections ------------------------------------------------------------------------ r1215 | kruppa | 2008-04-17 11:30:23 +0200 (Thu, 17 Apr 2008) | 2 lines Changed paths: M /trunk/mpzspv.c Marked floating-point constant "float" to match other operands ------------------------------------------------------------------------ r1214 | brian_gladman | 2008-04-16 15:17:10 +0200 (Wed, 16 Apr 2008) | 1 line Changed paths: A /trunk/build.vc9/readme.txt A /trunk/build.vc9/yasm.rules Add short description of how to use YASM with VC++ for assembler support and mention Python tests file ------------------------------------------------------------------------ r1213 | brian_gladman | 2008-04-16 13:58:44 +0200 (Wed, 16 Apr 2008) | 1 line Changed paths: M /trunk/build.vc9/assembler/a_x64_mulredc.asm M /trunk/build.vc9/config.h M 
/trunk/build.vc9/libecm/libecm.vcproj M /trunk/build.vc9/tests.py enable assembler build with VC++ ------------------------------------------------------------------------ r1212 | brian_gladman | 2008-04-16 12:32:11 +0200 (Wed, 16 Apr 2008) | 1 line Changed paths: M /trunk/build.vc9/assembler/a_x64_mulredc.asm M /trunk/build.vc9/config.h M /trunk/build.vc9/libecm/libecm.vcproj ------------------------------------------------------------------------ r1211 | brian_gladman | 2008-04-16 11:56:01 +0200 (Wed, 16 Apr 2008) | 1 line Changed paths: A /trunk/build.vc9/assembler A /trunk/build.vc9/assembler/a_x64_mulredc.asm A /trunk/build.vc9/assembler/a_x64_redc.asm A /trunk/build.vc9/assembler/test_mulredc.c M /trunk/build.vc9/config.h A /trunk/build.vc9/tests.py Added YASM assembler code for AMD64 VC++ build ------------------------------------------------------------------------ r1210 | zimmerma | 2008-04-15 18:22:55 +0200 (Tue, 15 Apr 2008) | 2 lines Changed paths: M /trunk/README changed comment about efficiency of NTT to match new code ------------------------------------------------------------------------ r1209 | kruppa | 2008-04-15 17:25:10 +0200 (Tue, 15 Apr 2008) | 3 lines Changed paths: M /trunk/pm1fs2.c M /trunk/sets_long.c Disabled P > 2^30 in 32 bit machines as they lead to overflow in integer arithmetic in sets_long.c. This limits B2-B2min to about 10^15. ------------------------------------------------------------------------ r1208 | kruppa | 2008-04-15 16:36:06 +0200 (Tue, 15 Apr 2008) | 3 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1fs2.c M /trunk/sets_long.c There was an integer overflow problem in sets_sumset_minmax() and maxS() on 32 bit machines. Changed arithmetic to use GMP for these. 
------------------------------------------------------------------------ r1207 | kruppa | 2008-04-15 14:40:15 +0200 (Tue, 15 Apr 2008) | 2 lines Changed paths: M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pp1.c Fixed overflow when converting maxmem from double -> size_t ------------------------------------------------------------------------ r1206 | kruppa | 2008-04-15 14:25:32 +0200 (Tue, 15 Apr 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Print elapsed real time for stage 2 in verbose mode with multi-threading ------------------------------------------------------------------------ r1205 | kruppa | 2008-04-15 11:03:21 +0200 (Tue, 15 Apr 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixed small output inconsistency in devverbose mode ------------------------------------------------------------------------ r1204 | kruppa | 2008-04-14 18:46:32 +0200 (Mon, 14 Apr 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Some changes to avoid integer overflow in memory estimation ------------------------------------------------------------------------ r1203 | kruppa | 2008-04-14 16:40:01 +0200 (Mon, 14 Apr 2008) | 3 lines Changed paths: M /trunk/mpzspm.c mpzspm_init() could miss a prime just below SP_MAX, reducing possible input size for a given transform length. ------------------------------------------------------------------------ r1202 | kruppa | 2008-04-14 16:38:42 +0200 (Mon, 14 Apr 2008) | 3 lines Changed paths: M /trunk/README Mention OMP_NUM_THREADS for OpenMP, explicit limits for input size and transform length on 32 bit machines. ------------------------------------------------------------------------ r1201 | kruppa | 2008-04-14 15:40:52 +0200 (Mon, 14 Apr 2008) | 4 lines Changed paths: M /trunk/auxlib.c Print number to be factored in decimal in checkpoint files. 
Conversion to decimal used to be slow, but is fast enough in recent GMP releases that this is not an issue any more ------------------------------------------------------------------------ r1200 | kruppa | 2008-04-14 14:14:19 +0200 (Mon, 14 Apr 2008) | 3 lines Changed paths: M /trunk/pm1.c Made code more linear to avoid having two if() branches with almost identical code. ------------------------------------------------------------------------ r1199 | kruppa | 2008-04-13 23:32:31 +0200 (Sun, 13 Apr 2008) | 3 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/pm1.c M /trunk/pp1.c Put code to print "Using B1=..." line in function rather than having 3 almost identical copies in pm1.c, pp1.c and ecm.c. ------------------------------------------------------------------------ r1198 | zimmerma | 2008-04-13 10:48:25 +0200 (Sun, 13 Apr 2008) | 2 lines Changed paths: M /trunk/ecm.c removed trailing blank ------------------------------------------------------------------------ r1197 | brian_gladman | 2008-04-12 17:09:39 +0200 (Sat, 12 Apr 2008) | 1 line Changed paths: M /trunk/README.dev M /trunk/build.vc9/config.h Update VC++ build to match configure.in version number ------------------------------------------------------------------------ r1196 | zimmerma | 2008-04-12 14:25:54 +0200 (Sat, 12 Apr 2008) | 2 lines Changed paths: M /trunk/AUTHORS added Brian in author list ------------------------------------------------------------------------ r1195 | zimmerma | 2008-04-12 14:23:54 +0200 (Sat, 12 Apr 2008) | 6 lines Changed paths: M /trunk/ChangeLog M /trunk/NEWS M /trunk/README M /trunk/configure.in NEWS: added some important items configure.in: changed version to 6.2-rc1 ChangeLog: added missing entries since 6.1 README: changes after complete pass (several issues remain to be solved by Jason and/or Alex) ------------------------------------------------------------------------ r1194 | brian_gladman | 2008-04-11 23:06:30 +0200 (Fri, 11 Apr 2008) | 1 line Changed 
paths: A /trunk/build.vc9 A /trunk/build.vc9/config.h A /trunk/build.vc9/ecm A /trunk/build.vc9/ecm/ecm.vcproj A /trunk/build.vc9/ecm-params.h A /trunk/build.vc9/ecm.sln A /trunk/build.vc9/file_copy.bat A /trunk/build.vc9/libecm A /trunk/build.vc9/libecm/libecm.vcproj ------------------------------------------------------------------------ r1193 | kruppa | 2008-04-11 15:58:37 +0200 (Fri, 11 Apr 2008) | 3 lines Changed paths: M /trunk/README Some updates for new release, mention new stage 2 and that it doesn't work with Brent-Suyama ------------------------------------------------------------------------ r1192 | kruppa | 2008-04-11 14:26:11 +0200 (Fri, 11 Apr 2008) | 2 lines Changed paths: M /trunk/ecm.1 M /trunk/ecm.xml Updated man page ------------------------------------------------------------------------ r1191 | kruppa | 2008-04-10 13:47:40 +0200 (Thu, 10 Apr 2008) | 2 lines Changed paths: M /trunk/sp.h Fixed compiler warning about shift-by-32 on 32 bit systems. ------------------------------------------------------------------------ r1190 | zimmerma | 2008-04-10 08:38:03 +0200 (Thu, 10 Apr 2008) | 2 lines Changed paths: M /trunk/mul_fft.c pragma was ill-positioned ------------------------------------------------------------------------ r1189 | zimmerma | 2008-04-09 21:01:59 +0200 (Wed, 09 Apr 2008) | 2 lines Changed paths: M /trunk/mul_fft.c incorporated patches for VC++ v9 from Brian Gladman ------------------------------------------------------------------------ r1188 | kruppa | 2008-04-09 18:38:23 +0200 (Wed, 09 Apr 2008) | 4 lines Changed paths: M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/sp.h Added conversion routines sp_t <-> mpz_t to overcome portability issues on systems where sp_t is wider than unsigned long and mpz_*_ui() functions can't be used. 
------------------------------------------------------------------------ r1187 | kruppa | 2008-04-09 16:31:36 +0200 (Wed, 09 Apr 2008) | 3 lines Changed paths: M /trunk/ecm.c In verbose mode if a factor was found, memory for table of Dickman rho values was not freed. ------------------------------------------------------------------------ r1186 | kruppa | 2008-04-09 16:17:08 +0200 (Wed, 09 Apr 2008) | 2 lines Changed paths: M /trunk/ecm-ecm.h M /trunk/main.c M /trunk/sets_long.c Changed some #if to #ifdef . Include alloca.h in sets_long.c ------------------------------------------------------------------------ r1185 | kruppa | 2008-04-09 16:15:28 +0200 (Wed, 09 Apr 2008) | 3 lines Changed paths: M /trunk/configure.in Check for setpriority() function (code tested for HAVE_SETPRIORITY, but configure never defined it) ------------------------------------------------------------------------ r1184 | kruppa | 2008-04-09 16:13:10 +0200 (Wed, 09 Apr 2008) | 3 lines Changed paths: M /trunk/tune.c Replaced %zd conversion in printf() by %ld with a typecast, z modifier is C99 and probably not very portable. ------------------------------------------------------------------------ r1183 | kruppa | 2008-04-09 15:33:41 +0200 (Wed, 09 Apr 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixed compiler warnings, bug (had comparison instead of assignment) ------------------------------------------------------------------------ r1182 | kruppa | 2008-04-09 15:32:10 +0200 (Wed, 09 Apr 2008) | 2 lines Changed paths: M /trunk/sp.h Fixed data type (was unsigned long instead of sp_t), removed unused variables ------------------------------------------------------------------------ r1181 | kruppa | 2008-04-09 15:30:47 +0200 (Wed, 09 Apr 2008) | 3 lines Changed paths: M /trunk/mul_fft.c Fixed some compiler warnings. Removed inclusion of longlong.h, as it expects certain data types for arithmetic on one-word integers to be defined. 
------------------------------------------------------------------------ r1180 | kruppa | 2008-04-09 15:28:02 +0200 (Wed, 09 Apr 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h Made parameter types for ntt_*() functions use the typedefs from sp.h ------------------------------------------------------------------------ r1179 | zimmerma | 2008-04-09 14:23:04 +0200 (Wed, 09 Apr 2008) | 4 lines Changed paths: M /trunk/polyeval.c M /trunk/stage2.c removed extra argument of polyeval() in stage2.c polyeval.c: polyeval() and polyeval_tellegen() must always be compiled, since they are needed in tune.c. ------------------------------------------------------------------------ r1178 | kruppa | 2008-04-09 10:40:54 +0200 (Wed, 09 Apr 2008) | 3 lines Changed paths: M /trunk/spm.c Fixed wrong type which broke arithmetic on machines where unsigned long has less width than sp_t. ------------------------------------------------------------------------ r1177 | kruppa | 2008-04-08 18:48:10 +0200 (Tue, 08 Apr 2008) | 2 lines Changed paths: M /trunk/sp.h Fixed typo in precompiler condition ------------------------------------------------------------------------ r1176 | kruppa | 2008-04-08 17:46:54 +0200 (Tue, 08 Apr 2008) | 3 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm2.c M /trunk/pm1.c M /trunk/pp1.c M /trunk/stage2.c Renamed mis-named function. Fixed output of uninitialised "dickson_a" value in ECM stage 2. ------------------------------------------------------------------------ r1175 | kruppa | 2008-04-08 16:51:35 +0200 (Tue, 08 Apr 2008) | 3 lines Changed paths: M /trunk/mul_fft.c Fixed bug in MPN_FFT_STORE: the non-asm version wrote too little data due to bad pointer type. 
------------------------------------------------------------------------ r1174 | zimmerma | 2008-04-08 15:55:08 +0200 (Tue, 08 Apr 2008) | 2 lines Changed paths: M /trunk/mul_fft.c added MPN_ZERO if not defined ------------------------------------------------------------------------ r1173 | kruppa | 2008-04-08 15:02:33 +0200 (Tue, 08 Apr 2008) | 3 lines Changed paths: M /trunk/main.c M /trunk/mul_fft.c M /trunk/sp.h Included some changes suggested by Brian Gladman to allow compiling under MS VC 9. ------------------------------------------------------------------------ r1172 | kruppa | 2008-04-08 13:03:51 +0200 (Tue, 08 Apr 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixed some "variable sized arrays" ------------------------------------------------------------------------ r1171 | zimmerma | 2008-04-08 12:55:37 +0200 (Tue, 08 Apr 2008) | 2 lines Changed paths: M /trunk/longlong.h M /trunk/mul_fft.c define dummy versions of __builtin_constant_p and __builtin_expect when not gcc ------------------------------------------------------------------------ r1170 | kruppa | 2008-04-08 12:49:05 +0200 (Tue, 08 Apr 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1fs2.c Fixed some compiler warnings with -pedantic ------------------------------------------------------------------------ r1169 | zimmerma | 2008-04-08 12:00:42 +0200 (Tue, 08 Apr 2008) | 2 lines Changed paths: M /trunk/TODO added TODO item ------------------------------------------------------------------------ r1168 | zimmerma | 2008-04-08 11:38:43 +0200 (Tue, 08 Apr 2008) | 5 lines Changed paths: M /trunk/Makefile.am M /trunk/asmredc.h M /trunk/getprime.c M /trunk/mul_fft.c Makefile.am: added -pedantic asmredc.h, mul_fft.c, getprime.c: - changed C++ style comments //... to C style /* ... 
*/ - fixed type declarations in between instructions ------------------------------------------------------------------------ r1167 | jasonp | 2008-04-08 04:30:45 +0200 (Tue, 08 Apr 2008) | 1 line Changed paths: M /trunk/sp.h force the size of small prime residues to explicitly match up with a GMP word ------------------------------------------------------------------------ r1166 | zimmerma | 2008-04-03 21:44:25 +0200 (Thu, 03 Apr 2008) | 3 lines Changed paths: M /trunk/ecm.c reduce the number of tried Lucas chains in PRAC for small numbers (thanks to Pierrick who noticed the overhead with MPFQ) ------------------------------------------------------------------------ r1165 | kruppa | 2008-03-25 19:40:32 +0100 (Tue, 25 Mar 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c M /trunk/pp1.c Some unsaved edits that were missing in last commit ------------------------------------------------------------------------ r1164 | kruppa | 2008-03-25 19:33:25 +0100 (Tue, 25 Mar 2008) | 4 lines Changed paths: M /trunk/configure.in M /trunk/ecm-impl.h M /trunk/median.c M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Added code for automatic parameters selection for new P-1 and P+1 stage 2 according to available memory. Report if there's an error (e.g. out-of-memory condition) in TMulKS(). ------------------------------------------------------------------------ r1163 | jasonp | 2008-03-23 00:29:40 +0100 (Sun, 23 Mar 2008) | 4 lines Changed paths: M /trunk/ntt_gfp.c M /trunk/sp.h M /trunk/spv.c Use 31-bit primes for the NTT on 32-bit systems. 
This is slightly slower than using 30-bit primes but allows arithmetic on larger polynomials ------------------------------------------------------------------------ r1162 | kruppa | 2008-03-21 18:49:16 +0100 (Fri, 21 Mar 2008) | 3 lines Changed paths: M /trunk/mpzspm.c M /trunk/pm1.c M /trunk/pp1.c M /trunk/sp.h New P+1 and P-1 stage 2 now checks maximal supported transform length for NTT, reduces lmax accordingly ------------------------------------------------------------------------ r1161 | kruppa | 2008-03-20 15:39:11 +0100 (Thu, 20 Mar 2008) | 14 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in M /trunk/main.c M /trunk/pm1fs2.c Andreas Schickel reported a problem with new stage 2 on 32 bit: there are so few suitable NTT primes < 2^30 that only short transform lengths (~2^19) are possible. Instead of requiring primes == 1 (mod 3l), code for building F with weighted convolutions is separate now and uses primes == 1 (mod 3l/4), the convolution for multipoint evaluation uses primes == 1 (mod l). This improves the situation a little, but the possible transform lengths (~2^20) are still rather small. Also fixed out of bound array access in ntt_sqr_reciprocal(). Print time stamp for each curve now instead of for each new number, requested by Andreas. ------------------------------------------------------------------------ r1160 | kruppa | 2008-03-20 11:49:17 +0100 (Thu, 20 Mar 2008) | 2 lines Changed paths: M /trunk/mul_fft.c Enable assertions if so specified in config.h (were always off!) 
------------------------------------------------------------------------ r1159 | kruppa | 2008-03-20 11:39:56 +0100 (Thu, 20 Mar 2008) | 2 lines Changed paths: M /trunk/mul_fft.c Added two missing ASSERT for NULL pointer after alloc ------------------------------------------------------------------------ r1158 | jasonp | 2008-03-20 04:16:17 +0100 (Thu, 20 Mar 2008) | 1 line Changed paths: M /trunk/ntt_gfp.c M /trunk/sp.h M /trunk/spm.c do not store NTT scratch array on the stack ------------------------------------------------------------------------ r1157 | kruppa | 2008-03-19 19:52:40 +0100 (Wed, 19 Mar 2008) | 5 lines Changed paths: M /trunk/pm1fs2.c Revert bug-"fix" from just before, it breaks the NTT. Test for NULL pointer from mpzspm_init(). Changed diagnostic output during parameter selection to TRACE level. ------------------------------------------------------------------------ r1156 | kruppa | 2008-03-19 19:29:52 +0100 (Wed, 19 Mar 2008) | 3 lines Changed paths: M /trunk/mpzspm.c M /trunk/pm1fs2.c Fixed bug: required transform length overestimated, reduced possible transform length on 32 bit machines ------------------------------------------------------------------------ r1155 | kruppa | 2008-03-19 17:57:20 +0100 (Wed, 19 Mar 2008) | 3 lines Changed paths: M /trunk/Makefile.am Added ecm-params.h.pentium3 and ecm-params.h.pentium4 to list of files to put in distribution. 
------------------------------------------------------------------------ r1154 | kruppa | 2008-03-19 16:30:28 +0100 (Wed, 19 Mar 2008) | 2 lines Changed paths: M /trunk/Makefile.am Added rule to remove config.m4 on distclean, to make distcheck work ------------------------------------------------------------------------ r1153 | zimmerma | 2008-03-19 13:51:39 +0100 (Wed, 19 Mar 2008) | 2 lines Changed paths: A /trunk/ecm-params.h.pentiumm tuning parameters for pentium M ------------------------------------------------------------------------ r1152 | kruppa | 2008-03-19 12:36:38 +0100 (Wed, 19 Mar 2008) | 2 lines Changed paths: M /trunk/AUTHORS M /trunk/NEWS Added Jason, fixed typo ------------------------------------------------------------------------ r1151 | kruppa | 2008-03-19 11:46:37 +0100 (Wed, 19 Mar 2008) | 2 lines Changed paths: A /trunk/ecm-params.h.pentium4 Parameter file for Pentium 4 ------------------------------------------------------------------------ r1150 | kruppa | 2008-03-19 11:45:55 +0100 (Wed, 19 Mar 2008) | 2 lines Changed paths: M /trunk/ecm-params.h.athlon64 Updated parameters for new NTT ------------------------------------------------------------------------ r1149 | kruppa | 2008-03-19 11:45:40 +0100 (Wed, 19 Mar 2008) | 2 lines Changed paths: M /trunk/NEWS Updated NEWS for 6.2 ------------------------------------------------------------------------ r1148 | kruppa | 2008-03-18 20:18:06 +0100 (Tue, 18 Mar 2008) | 3 lines Changed paths: M /trunk/configure.in Added --enable-sse2 option. Default is using SSE2 on Pentium 4, not using it on other architectures. 
------------------------------------------------------------------------ r1147 | kruppa | 2008-03-18 20:16:59 +0100 (Tue, 18 Mar 2008) | 4 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Made -maxmem work with new P+-1 stage 2, also chooses one pass evaluation (computing convolutions for both coordinates, adding in transform space to save an inverse transform) for P+1 if memory allows. ------------------------------------------------------------------------ r1146 | kruppa | 2008-03-14 15:14:15 +0100 (Fri, 14 Mar 2008) | 4 lines Changed paths: M /trunk/sp.h Re-wrote asm macro for sp_add, sp_sub. Fixes (I hope!) a subtle bug in operand constraints (edx could be used as input operand in spite of being marked "clobbered") and I hope is a bit faster, too. ------------------------------------------------------------------------ r1145 | kruppa | 2008-03-13 15:19:32 +0100 (Thu, 13 Mar 2008) | 2 lines Changed paths: M /trunk/configure.in autoconf doesn't like space in macro ------------------------------------------------------------------------ r1144 | kruppa | 2008-03-13 14:55:54 +0100 (Thu, 13 Mar 2008) | 2 lines Changed paths: M /trunk/Fgw.c Thrown out lots of dead code ------------------------------------------------------------------------ r1143 | kruppa | 2008-03-13 14:52:22 +0100 (Thu, 13 Mar 2008) | 4 lines Changed paths: M /trunk/ecm.c M /trunk/lucas.c Replaced floating-point constants for PRAC by their reciprocals and division by multiplication. Some speedup for P+1 with very small numbers, for other cases very little speedup. ------------------------------------------------------------------------ r1142 | kruppa | 2008-02-27 18:45:43 +0100 (Wed, 27 Feb 2008) | 3 lines Changed paths: M /trunk/Fgw.c Added assert to GWNUM ECM stage 1 interface to test that output residue fits in allocated space. 
------------------------------------------------------------------------ r1141 | kruppa | 2008-02-27 15:54:00 +0100 (Wed, 27 Feb 2008) | 4 lines Changed paths: M /trunk/Fgw.c M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/mpmod.c M /trunk/schoen_strass.c Removed code to use GWNUM for individual multiplications modulo Fermat numbers - I think it was broken, never got used much anyway and if we really want it, will need a rewrite from scratch ------------------------------------------------------------------------ r1140 | kruppa | 2008-02-27 15:43:21 +0100 (Wed, 27 Feb 2008) | 2 lines Changed paths: M /trunk/ecm.c Removed some remaining dead Montgomery roots code ------------------------------------------------------------------------ r1139 | kruppa | 2008-02-27 12:03:47 +0100 (Wed, 27 Feb 2008) | 3 lines Changed paths: M /trunk/pm1fs2.c Added check of result to ntt_sqr_reciprocal(), hoping to find an elusive bug that appears on my AMD Athlon ------------------------------------------------------------------------ r1138 | kruppa | 2008-02-27 01:36:13 +0100 (Wed, 27 Feb 2008) | 2 lines Changed paths: M /trunk/mpzspv.c Fixed segfault in mpzspv_verify(). 
------------------------------------------------------------------------ r1137 | kruppa | 2008-02-26 23:46:13 +0100 (Tue, 26 Feb 2008) | 2 lines Changed paths: M /trunk/x86_64/README Added comment on generating mulredc{1,2}.asm from Python script ------------------------------------------------------------------------ r1136 | kruppa | 2008-02-26 23:42:50 +0100 (Tue, 26 Feb 2008) | 2 lines Changed paths: M /trunk/x86_64/Makefile.am M /trunk/x86_64/Makefile.dev M /trunk/x86_64/generate_all Updated Makefiles/scripts to make mulredc asm code from autogen.py/mulredc.m4 ------------------------------------------------------------------------ r1135 | kruppa | 2008-02-26 23:24:25 +0100 (Tue, 26 Feb 2008) | 2 lines Changed paths: M /trunk/x86_64/mulredc1.asm mulredc1.asm as generated by Python script (without my old comment edits) ------------------------------------------------------------------------ r1134 | kruppa | 2008-02-26 19:59:22 +0100 (Tue, 26 Feb 2008) | 3 lines Changed paths: M /trunk/configure.in M /trunk/ntt_gfp.c M /trunk/spv.c configure now defines HAS_SSE2 if running on Pentium 4 so NTT uses Jason Papadopoulos' SSE2 code. 
------------------------------------------------------------------------ r1133 | kruppa | 2008-02-26 19:21:56 +0100 (Tue, 26 Feb 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Parallelized pp1_sequence_h(), last function that hadn't been done yet ------------------------------------------------------------------------ r1132 | kruppa | 2008-02-22 17:53:40 +0100 (Fri, 22 Feb 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c M /trunk/sets_long.c Fixed typo, spelt out ANTS ------------------------------------------------------------------------ r1131 | kruppa | 2008-02-22 17:42:15 +0100 (Fri, 22 Feb 2008) | 4 lines Changed paths: M /trunk/auxlib.c M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/ntt_gfp.c M /trunk/pm1fs2.c M /trunk/sets_long.c M /trunk/sp.c M /trunk/sp.h M /trunk/spm.c M /trunk/spv.c M /trunk/tune.c Included patch by Jason to fix out-of-bounds array access. Updated copyright information. ------------------------------------------------------------------------ r1130 | kruppa | 2008-02-21 19:36:26 +0100 (Thu, 21 Feb 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Removed dead code, made output of timing info more consistent ------------------------------------------------------------------------ r1129 | kruppa | 2008-02-21 17:18:34 +0100 (Thu, 21 Feb 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c Changed _ui functions to take unsigned longs, as GMP does ------------------------------------------------------------------------ r1128 | kruppa | 2008-02-21 15:07:54 +0100 (Thu, 21 Feb 2008) | 2 lines Changed paths: M /trunk/getprime.c Fixed access to uninitialised data in getprime() ------------------------------------------------------------------------ r1127 | kruppa | 2008-02-21 12:14:35 +0100 (Thu, 21 Feb 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/getprime.c M /trunk/mpmod.c M /trunk/mul_fft.c Fixed some compiler warnings. 
------------------------------------------------------------------------ r1126 | kruppa | 2008-02-21 11:42:33 +0100 (Thu, 21 Feb 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm2.c M /trunk/pm1.c M /trunk/pp1.c M /trunk/stage2.c Fixed ugly typecast ------------------------------------------------------------------------ r1125 | kruppa | 2008-02-20 15:34:07 +0100 (Wed, 20 Feb 2008) | 3 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm2.c Removed code for ECM stage 2 roots in Montgomery form. Never worked and probably never will. ------------------------------------------------------------------------ r1124 | kruppa | 2008-02-20 15:19:11 +0100 (Wed, 20 Feb 2008) | 2 lines Changed paths: M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/ntt_gfp.c M /trunk/pm1fs2.c M /trunk/sp.c M /trunk/sp.h M /trunk/spm.c M /trunk/spv.c M /trunk/tune.c Merging Jason Papadopoulos' new SSE2 NTT code ------------------------------------------------------------------------ r1123 | kruppa | 2008-02-20 15:18:35 +0100 (Wed, 20 Feb 2008) | 2 lines Changed paths: M /trunk/techdocs/convolv.tex Cleanups ------------------------------------------------------------------------ r1122 | kruppa | 2008-02-20 13:56:45 +0100 (Wed, 20 Feb 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Some small cleanup of timing/residue output ------------------------------------------------------------------------ r1121 | kruppa | 2008-02-19 23:53:22 +0100 (Tue, 19 Feb 2008) | 2 lines Changed paths: M /trunk/stage2.c Small changes (mostly comments) ------------------------------------------------------------------------ r1120 | kruppa | 2008-02-19 19:03:30 +0100 (Tue, 19 Feb 2008) | 2 lines Changed paths: M /trunk/pm1.c Made new stage 2 the default. Fixed small memory leak. ------------------------------------------------------------------------ r1119 | kruppa | 2008-02-19 18:59:20 +0100 (Tue, 19 Feb 2008) | 3 lines Changed paths: M /trunk/pp1.c Fixed bug in computing roots of G when i0 < -6. 
Made the new stage 2 the default. Fixed small memory leak. ------------------------------------------------------------------------ r1118 | zimmerma | 2008-02-18 18:36:44 +0100 (Mon, 18 Feb 2008) | 2 lines Changed paths: M /trunk/README.dev A /trunk/testlong.pp1 added new (long) test file for P+1 ------------------------------------------------------------------------ r1117 | zimmerma | 2008-02-18 14:38:34 +0100 (Mon, 18 Feb 2008) | 3 lines Changed paths: M /trunk/test.pp1 added test cases that exhibit bug in 6.1.3 with polynomials of degree > 1 (and maybe in earlier versions) ------------------------------------------------------------------------ r1116 | kruppa | 2008-02-18 11:46:12 +0100 (Mon, 18 Feb 2008) | 2 lines Changed paths: M /trunk/techdocs/buildpoly.tex Something about converting polynomial bases, forgot what's it about by now ------------------------------------------------------------------------ r1115 | zimmerma | 2008-02-12 23:40:53 +0100 (Tue, 12 Feb 2008) | 2 lines Changed paths: M /trunk/main.c update P+1 top-ten bound ------------------------------------------------------------------------ r1114 | jasonp | 2008-01-30 08:04:04 +0100 (Wed, 30 Jan 2008) | 1 line Changed paths: M /trunk/Makefile.am allow 'make check' to work in MinGW ------------------------------------------------------------------------ r1113 | kruppa | 2008-01-29 16:57:28 +0100 (Tue, 29 Jan 2008) | 4 lines Changed paths: M /trunk/pm1fs2.c Fixed small memory leak (S_2). Allocate enough memory for mpz_t's to avoid reallocs. 
------------------------------------------------------------------------ r1112 | kruppa | 2008-01-29 16:54:45 +0100 (Tue, 29 Jan 2008) | 2 lines Changed paths: M /trunk/mpmod.c Fixed compiler warning: parentheses around assignment as truth value ------------------------------------------------------------------------ r1111 | kruppa | 2008-01-29 16:53:58 +0100 (Tue, 29 Jan 2008) | 2 lines Changed paths: M /trunk/mpzspm.c Minor changes: replace mpz_add by mpz_mul_2exp, changes to comments ------------------------------------------------------------------------ r1110 | kruppa | 2008-01-23 18:09:04 +0100 (Wed, 23 Jan 2008) | 2 lines Changed paths: M /trunk/mpzspv.c Multi-threading pragmas for mpzspv_from_mpzv() ------------------------------------------------------------------------ r1109 | kruppa | 2008-01-21 23:20:26 +0100 (Mon, 21 Jan 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c A little more parallelism in ntt_sqr_recip() to improve timings ------------------------------------------------------------------------ r1108 | kruppa | 2008-01-21 20:11:24 +0100 (Mon, 21 Jan 2008) | 2 lines Changed paths: M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/mpzspv.c M /trunk/pm1fs2.c M /trunk/sp.h Some more parallelization to get nicer timings for the final paper ------------------------------------------------------------------------ r1107 | kruppa | 2008-01-21 20:10:57 +0100 (Mon, 21 Jan 2008) | 3 lines Changed paths: M /trunk/mpmod.c ECM_MOD_MPZ reduction uses aux_modulus now, but that was not copied by mpmod_copy(). Fixed. 
------------------------------------------------------------------------ r1106 | kruppa | 2008-01-21 14:57:23 +0100 (Mon, 21 Jan 2008) | 2 lines Changed paths: A /trunk/phiP.gp A pari script to make P values for new stage 2 ------------------------------------------------------------------------ r1105 | kruppa | 2008-01-21 01:16:13 +0100 (Mon, 21 Jan 2008) | 4 lines Changed paths: M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c M /trunk/sp.h Changed ntt_sqr_recip() to use Montgomery's idea of using a primitive 3rd root of unity for the weight signal. ------------------------------------------------------------------------ r1104 | zimmerma | 2008-01-17 08:52:20 +0100 (Thu, 17 Jan 2008) | 2 lines Changed paths: M /trunk/mpmod.c fixed a bug in mpres_mpz_mod when n=1 ------------------------------------------------------------------------ r1103 | zimmerma | 2008-01-16 11:56:32 +0100 (Wed, 16 Jan 2008) | 2 lines Changed paths: M /trunk/mpmod.c fixed bug in mpres_mpz_mod in case T has more than 2n limbs ------------------------------------------------------------------------ r1102 | zimmerma | 2008-01-16 10:42:19 +0100 (Wed, 16 Jan 2008) | 5 lines Changed paths: M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h M /trunk/ecm2.c M /trunk/main.c M /trunk/mpmod.c M /trunk/pp1.c mpmod_init code was duplicated: now call mpmod_init directly this also solved some inconsistencies in arithmetic options: in some cases ECM did not use mpz_mod although -mpzmod was given added new algorithm which speeds up -mpzmod arithmetic ------------------------------------------------------------------------ r1101 | kruppa | 2008-01-15 18:02:11 +0100 (Tue, 15 Jan 2008) | 2 lines Changed paths: M /trunk/mpzspm.c M /trunk/spm.c Allow computation of roots of unity whose order isn't a power of 2 ------------------------------------------------------------------------ r1100 | zimmerma | 2008-01-15 15:45:23 +0100 (Tue, 15 Jan 2008) | 3 lines Changed paths: M /trunk/TODO A /trunk/TODO.kunz added 
suggestions from Thomas Kunz, to make it easier to port GMP-ECM to specific architectures ------------------------------------------------------------------------ r1099 | kruppa | 2008-01-14 15:53:22 +0100 (Mon, 14 Jan 2008) | 3 lines Changed paths: M /trunk/pm1fs2.c Fixed bug in computation of g sequence for P-1 with many threads: negative value could get assigned to unsigned long. ------------------------------------------------------------------------ r1098 | kruppa | 2008-01-13 11:38:53 +0100 (Sun, 13 Jan 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Modified ntt_sqr_recip to work as described in the paper, parallelized it ------------------------------------------------------------------------ r1097 | zimmerma | 2008-01-13 10:21:21 +0100 (Sun, 13 Jan 2008) | 2 lines Changed paths: M /trunk/TODO A /trunk/TODO.fat added suggestion from Peter Montgomery ------------------------------------------------------------------------ r1096 | kruppa | 2008-01-12 15:37:28 +0100 (Sat, 12 Jan 2008) | 3 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c If NTT is used, ensure s_1 < lmax/2 so that poly degrees stay just below a power of two while building F. ------------------------------------------------------------------------ r1095 | kruppa | 2008-01-11 20:37:24 +0100 (Fri, 11 Jan 2008) | 2 lines Changed paths: M /trunk/techdocs/mulrecip.tex More details on DWT mul for RLPs ------------------------------------------------------------------------ r1094 | kruppa | 2008-01-10 19:47:02 +0100 (Thu, 10 Jan 2008) | 2 lines Changed paths: A /trunk/techdocs/mulrecip.tex D /trunk/techdocs/mulrecipdwt.tex Added something on multiplying RLP without DWT/NTT ------------------------------------------------------------------------ r1093 | kruppa | 2008-01-10 19:44:08 +0100 (Thu, 10 Jan 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c Rewrote list_mul_reciprocal() to use less temp memory. 
------------------------------------------------------------------------ r1092 | kruppa | 2008-01-09 22:44:22 +0100 (Wed, 09 Jan 2008) | 4 lines Changed paths: M /trunk/Makefile.am A /trunk/auxarith.c M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/ecm2.c M /trunk/pm1fs2.c Moved functions for simple unsigned long arithmetic to auxarith.c Added function for squaring an RLP with a discrete weighted NTT of half length. ------------------------------------------------------------------------ r1091 | kruppa | 2008-01-07 23:04:55 +0100 (Mon, 07 Jan 2008) | 2 lines Changed paths: M /trunk/techdocs/mulrecipdwt.tex Small changes ------------------------------------------------------------------------ r1090 | kruppa | 2008-01-07 22:45:01 +0100 (Mon, 07 Jan 2008) | 2 lines Changed paths: A /trunk/techdocs/mulrecipdwt.tex A note on (hopefully) multiplying RLPs with a weighted FFT ------------------------------------------------------------------------ r1089 | kruppa | 2008-01-07 22:43:23 +0100 (Mon, 07 Jan 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c New function for squaring RLPs, simpler than general multiply one ------------------------------------------------------------------------ r1088 | kruppa | 2008-01-07 22:41:13 +0100 (Mon, 07 Jan 2008) | 2 lines Changed paths: M /trunk/pm1fs2.c M /trunk/sets_long.c Some more cleanups ------------------------------------------------------------------------ r1087 | kruppa | 2008-01-06 22:12:24 +0100 (Sun, 06 Jan 2008) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1fs2.c M /trunk/sets_long.c M /trunk/x86_64/README M /trunk/x86_64/test_mulredc.c More cleanup and bugfixes ------------------------------------------------------------------------ r1086 | kruppa | 2008-01-02 11:22:55 +0100 (Wed, 02 Jan 2008) | 2 lines Changed paths: M /trunk/Makefile.am M /trunk/ecm-impl.h M /trunk/pm1fs2.c M /trunk/sets_long.c More code cleanup in sets_long.c, made pm1fs2.c use those functions 
------------------------------------------------------------------------ r1085 | zimmerma | 2007-12-25 17:20:25 +0100 (Tue, 25 Dec 2007) | 2 lines Changed paths: M /trunk/main.c updated champions thresholds ------------------------------------------------------------------------ r1084 | kruppa | 2007-12-21 16:20:41 +0100 (Fri, 21 Dec 2007) | 3 lines Changed paths: M /trunk/ecm-impl.h A /trunk/sets_long.c Rewrote most of the operations on sets of longs for clarity, moved them into own source file. ------------------------------------------------------------------------ r1083 | kruppa | 2007-12-20 10:37:57 +0100 (Thu, 20 Dec 2007) | 2 lines Changed paths: M /trunk/mpmod.c Fixed bug in mpmod_copy() with 2^n-1 numbers. ------------------------------------------------------------------------ r1082 | kruppa | 2007-12-19 11:53:59 +0100 (Wed, 19 Dec 2007) | 4 lines Changed paths: M /trunk/mul_fft.c Fixed unparenthesized parameter in __GMP_ALLOCATE_FUNC_LIMBS macro. Changed copy of inputs in mpn_mul_fft_aux() to use malloc instead of alloca. Added ASSERT != NULL to temp space allocs. ------------------------------------------------------------------------ r1081 | kruppa | 2007-12-18 11:37:01 +0100 (Tue, 18 Dec 2007) | 2 lines Changed paths: M /trunk/factor.c Updated default parameters so ugly hack works outside of GMP-ECM ------------------------------------------------------------------------ r1080 | kruppa | 2007-12-18 11:00:28 +0100 (Tue, 18 Dec 2007) | 2 lines Changed paths: M /trunk/factor.c M /trunk/main.c Added ugly hack to pass B2scale parameter to library ------------------------------------------------------------------------ r1079 | kruppa | 2007-12-17 21:44:14 +0100 (Mon, 17 Dec 2007) | 2 lines Changed paths: M /trunk/x86_64/README Updated README for new m4 script. 
------------------------------------------------------------------------ r1078 | zimmerma | 2007-12-17 14:25:56 +0100 (Mon, 17 Dec 2007) | 2 lines Changed paths: M /trunk/TODO reorganized, added a table of contents, and added a section "installation" ------------------------------------------------------------------------ r1077 | kruppa | 2007-12-17 13:38:13 +0100 (Mon, 17 Dec 2007) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/configure.in M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm If GMP is linked statically, link mpmod.o and GMP first to put speed critical functions close together, hoping to avoid cache collisions. ------------------------------------------------------------------------ r1076 | zimmerma | 2007-12-14 21:45:36 +0100 (Fri, 14 Dec 2007) | 4 lines Changed paths: M /trunk/Makefile.am Hard-coded compilation line for alternate binary (ecm2) with speed-critical routines close together. Should be removed once somebody figures out how to do this properly. 
------------------------------------------------------------------------ r1075 | kruppa | 2007-12-13 15:24:22 +0100 (Thu, 13 Dec 2007) | 2 lines Changed paths: M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm More optimization of mulredc, up to 4% faster ------------------------------------------------------------------------ r1074 | kruppa | 2007-12-05 16:08:00 +0100 (Wed, 05 Dec 2007) | 4 lines Changed paths: M /trunk/mul_fft.c Some functions caused symbol conflict when linking GMP statically. Made those functions "static" in mul_fft.c, as they do not seem to be used outside of that file. 
------------------------------------------------------------------------ r1073 | kruppa | 2007-12-05 15:25:20 +0100 (Wed, 05 Dec 2007) | 2 lines Changed paths: M /trunk/x86_64/bench.c M /trunk/x86_64/mulredc.m4 M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm M /trunk/x86_64/test_mulredc.c Fixed comments to match code ------------------------------------------------------------------------ r1072 | kruppa | 2007-12-05 10:19:31 +0100 (Wed, 05 Dec 2007) | 3 lines Changed paths: M /trunk/mpmod.c Added an assertion to modmul_basecase which compares results with redc_basecase. 
Enable with -DWANT_ASSERT_EXPENSIVE ------------------------------------------------------------------------ r1071 | kruppa | 2007-12-05 10:10:32 +0100 (Wed, 05 Dec 2007) | 2 lines Changed paths: M /trunk/ntt_gfp.c Added DCT function, but does not work correctly yet - output is not a DCT-II ------------------------------------------------------------------------ r1070 | kruppa | 2007-12-05 10:09:19 +0100 (Wed, 05 Dec 2007) | 3 lines Changed paths: M /trunk/x86_64/mulredc.m4 Fixed m4 quotes to allow generation of .asm files (which get processed by m4 again during compilation) ------------------------------------------------------------------------ r1069 | kruppa | 2007-12-04 18:19:36 +0100 (Tue, 04 Dec 2007) | 4 lines Changed paths: M /trunk/x86_64/mulredc10.asm M /trunk/x86_64/mulredc11.asm M /trunk/x86_64/mulredc12.asm M /trunk/x86_64/mulredc13.asm M /trunk/x86_64/mulredc14.asm M /trunk/x86_64/mulredc15.asm M /trunk/x86_64/mulredc16.asm M /trunk/x86_64/mulredc17.asm M /trunk/x86_64/mulredc18.asm M /trunk/x86_64/mulredc19.asm M /trunk/x86_64/mulredc20.asm M /trunk/x86_64/mulredc3.asm M /trunk/x86_64/mulredc4.asm M /trunk/x86_64/mulredc5.asm M /trunk/x86_64/mulredc6.asm M /trunk/x86_64/mulredc7.asm M /trunk/x86_64/mulredc8.asm M /trunk/x86_64/mulredc9.asm Assembler files generated with m4 -DLENGTH=3 mulredc.m4 > mulredc3.asm etc. ------------------------------------------------------------------------ r1068 | kruppa | 2007-12-04 17:47:21 +0100 (Tue, 04 Dec 2007) | 2 lines Changed paths: M /trunk/x86_64/mulredc.m4 Somewhat faster. Speedup over Python script generated code is 7-10%. 
------------------------------------------------------------------------ r1067 | zimmerma | 2007-12-04 09:39:32 +0100 (Tue, 04 Dec 2007) | 4 lines Changed paths: M /trunk/configure.in M /trunk/ecm-gmp.h M /trunk/ks-multiply.c M /trunk/listz.c M /trunk/mpmod.c M /trunk/schoen_strass.c do not use any more GMP's mpn_mul_fft (which was not public), and always use instead ecm_mpn_mul_fft (included in GMP-ECM, and faster). Yields small speedup for Fermat numbers. ------------------------------------------------------------------------ r1066 | zimmerma | 2007-12-03 21:07:30 +0100 (Mon, 03 Dec 2007) | 5 lines Changed paths: M /trunk/configure.in M /trunk/ecm-params.h.athlon64 A /trunk/ecm-params.h.pentium3 M /trunk/mul_fft.c configure.in: added tuning (ecm-params) for pentium3 mul_fft.c: added default values of parameters ecm-params.h.pentium3: new file with tuned values for pentium M ecm-params.h.athlon64: removed useless values ------------------------------------------------------------------------ r1065 | kruppa | 2007-12-03 21:04:20 +0100 (Mon, 03 Dec 2007) | 4 lines Changed paths: A /trunk/x86_64/mulredc.m4 Rewrite of ASM-generating script for mulredc, this time written in m4. Produces slightly faster (on Opteron) code than the old one, probably can be improved yet. ------------------------------------------------------------------------ r1064 | zimmerma | 2007-12-03 18:08:44 +0100 (Mon, 03 Dec 2007) | 2 lines Changed paths: A /trunk/mul_fft.c new FFT code, adapted for GMP-ECM ------------------------------------------------------------------------ r1063 | zimmerma | 2007-12-03 18:01:36 +0100 (Mon, 03 Dec 2007) | 4 lines Changed paths: M /trunk/Makefile.am M /trunk/ecm-params.h.athlon64 M /trunk/ks-multiply.c incorporated new FFT code into GMP-ECM: yields nice speedup in stage 2 with -no-ntt. Works so far only on x86_64, still remains to create parameter files on other architectures. 
------------------------------------------------------------------------ r1062 | zimmerma | 2007-12-02 22:41:26 +0100 (Sun, 02 Dec 2007) | 2 lines Changed paths: M /trunk/TODO added efficiency item ------------------------------------------------------------------------ r1061 | kruppa | 2007-11-26 17:25:28 +0100 (Mon, 26 Nov 2007) | 3 lines Changed paths: M /trunk/countsmooth.c M /trunk/ecm-ecm.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/eval.c M /trunk/getprime.c M /trunk/pm1.c M /trunk/pp1.c M /trunk/trial.c P-1 and P+1 now skip from sqrt(B1) to B1done in stage 1 when resuming. Large speedup when increasing large B1 in small steps. ------------------------------------------------------------------------ r1060 | kruppa | 2007-11-26 16:57:42 +0100 (Mon, 26 Nov 2007) | 3 lines Changed paths: M /trunk/pm1fs2.c More P values, malloc() for spv's in parallel region (cpu binding tbd), some changes to comments. ------------------------------------------------------------------------ r1059 | kruppa | 2007-11-16 18:18:30 +0100 (Fri, 16 Nov 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Parallelized P+1 sequence g and DCT of h ------------------------------------------------------------------------ r1058 | zimmerma | 2007-11-16 15:42:26 +0100 (Fri, 16 Nov 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c removed space before 'ms' to be coherent with previous versions of GMP-ECM ------------------------------------------------------------------------ r1057 | kruppa | 2007-11-16 15:04:50 +0100 (Fri, 16 Nov 2007) | 3 lines Changed paths: M /trunk/configure.in M /trunk/mpmod.c M /trunk/pm1fs2.c Some parallelization in the new P+-1 stage 2. Enable with --enable-openmp. Building f, sequence h for P-1, and sequences g and h for P+1 are TBD. ------------------------------------------------------------------------ r1056 | kruppa | 2007-11-09 17:24:26 +0100 (Fri, 09 Nov 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c Added mpmod_copy() to clone a mpmod_t, i.e. 
for threads ------------------------------------------------------------------------ r1055 | kruppa | 2007-11-08 18:45:59 +0100 (Thu, 08 Nov 2007) | 3 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1fs2.c M /trunk/pp1.c Added P+1 fast stage 2 variant that generates the coordinates of g one at a time. This saves about 30% memory. ------------------------------------------------------------------------ r1054 | kruppa | 2007-11-03 19:42:16 +0100 (Sat, 03 Nov 2007) | 3 lines Changed paths: M /trunk/ecm.c M /trunk/pm1.c M /trunk/pp1.c Fixed bug where resuming and immediately interrupting would produce a lower B1 value in save file than was in input file. ------------------------------------------------------------------------ r1053 | kruppa | 2007-10-30 23:53:18 +0100 (Tue, 30 Oct 2007) | 2 lines Changed paths: M /trunk/main.c Install signal handler only if a save file was specified. ------------------------------------------------------------------------ r1052 | kruppa | 2007-10-30 23:50:18 +0100 (Tue, 30 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixed parameters selection, was slow and produced suboptimal parameters. ------------------------------------------------------------------------ r1051 | kruppa | 2007-10-30 17:52:30 +0100 (Tue, 30 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixes bug in parameter selection, loop condition was wrong ------------------------------------------------------------------------ r1050 | kruppa | 2007-10-30 00:15:24 +0100 (Tue, 30 Oct 2007) | 2 lines Changed paths: M /trunk/mpzspv.c M /trunk/pm1fs2.c Cleanups. Convert from NTT in MPZSPV_NORMALISE_STRIDE blocks. ------------------------------------------------------------------------ r1049 | kruppa | 2007-10-29 16:54:27 +0100 (Mon, 29 Oct 2007) | 2 lines Changed paths: M /trunk/mpzspv.c M /trunk/pm1fs2.c Fixes bug where negative value in mpz_t was passed to mpzspv_to_ntt(). 
------------------------------------------------------------------------ r1048 | kruppa | 2007-10-29 15:11:28 +0100 (Mon, 29 Oct 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pm1fs2.c Restored non-NTT P-1 stage 2. Small cleanups. ------------------------------------------------------------------------ r1047 | zimmerma | 2007-10-29 14:22:00 +0100 (Mon, 29 Oct 2007) | 2 lines Changed paths: M /trunk/main.c updated champion size for P-1 ------------------------------------------------------------------------ r1046 | kruppa | 2007-10-27 18:42:22 +0200 (Sat, 27 Oct 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpzspv.c M /trunk/pm1fs2.c M /trunk/pp1.c M /trunk/sp.h Added NTT variant of new P+1 stage 2 ------------------------------------------------------------------------ r1045 | kruppa | 2007-10-27 18:37:50 +0200 (Sat, 27 Oct 2007) | 2 lines Changed paths: A /trunk/makesmooth.gp PARI script for generating test numbers for P-1 and P+1 ------------------------------------------------------------------------ r1044 | kruppa | 2007-10-24 18:22:57 +0200 (Wed, 24 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Reduced temp memory use in list_mul_symmetric() ------------------------------------------------------------------------ r1043 | kruppa | 2007-10-23 15:52:20 +0200 (Tue, 23 Oct 2007) | 3 lines Changed paths: M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c In new P-1 stage 2, only the lmax/2+1 distinct coefficients of the NTT of h are stored now. ------------------------------------------------------------------------ r1042 | kruppa | 2007-10-22 17:42:21 +0200 (Mon, 22 Oct 2007) | 2 lines Changed paths: M /trunk/mpzspm.c M /trunk/pm1fs2.c Small fix and cleanup of pm1_sequence_[gh]. 
------------------------------------------------------------------------ r1041 | kruppa | 2007-10-22 05:00:35 +0200 (Mon, 22 Oct 2007) | 3 lines Changed paths: M /trunk/mpzspv.c M /trunk/pm1fs2.c M /trunk/sp.c M /trunk/sp.h M /trunk/spv.c New P-1 stage 2 changed to use NTT for convolution product. Beware: there is a bug, sometimes misses factors. To be fixed. ------------------------------------------------------------------------ r1040 | kruppa | 2007-10-19 17:25:10 +0200 (Fri, 19 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Rewrite of pp1_sequence_g() to use only 5 multiplications per g_i ------------------------------------------------------------------------ r1039 | kruppa | 2007-10-18 15:04:15 +0200 (Thu, 18 Oct 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Take -k parameter into account when choosing parameters for new stage 2 ------------------------------------------------------------------------ r1038 | kruppa | 2007-10-18 15:02:05 +0200 (Thu, 18 Oct 2007) | 2 lines Changed paths: M /trunk/main.c Print hostname of machine it's running on in verbose mode ------------------------------------------------------------------------ r1037 | kruppa | 2007-10-17 02:13:16 +0200 (Wed, 17 Oct 2007) | 3 lines Changed paths: M /trunk/pm1fs2.c Function for exponentiating in extension ring can take mpz_t now, fixes an unsigned long overflow ------------------------------------------------------------------------ r1036 | kruppa | 2007-10-16 16:02:30 +0200 (Tue, 16 Oct 2007) | 4 lines Changed paths: M /trunk/pm1.c M /trunk/pm1fs2.c P+1 does gcd on first coordinate in extension ring of product polynomial now, this fixes the problem with getting 0 in the second coordinate in the last point of evaluation and when m_1 is negative. 
------------------------------------------------------------------------ r1035 | zimmerma | 2007-10-13 20:08:15 +0200 (Sat, 13 Oct 2007) | 3 lines Changed paths: M /trunk/eval.c got rid of quadratic memory reallocation in main eval routine (thanks to Alban Nonymous) ------------------------------------------------------------------------ r1034 | zimmerma | 2007-10-11 22:26:05 +0200 (Thu, 11 Oct 2007) | 2 lines Changed paths: M /trunk/auxi.c fixed copyright line ------------------------------------------------------------------------ r1033 | zimmerma | 2007-10-11 22:15:09 +0200 (Thu, 11 Oct 2007) | 2 lines Changed paths: M /trunk/auxi.c fixed efficiency issue in nb_digits: cost was O(n^2) for n-digit input ------------------------------------------------------------------------ r1032 | kruppa | 2007-10-10 13:58:51 +0200 (Wed, 10 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Sped up computation of g_i sequence for P-1 ------------------------------------------------------------------------ r1031 | kruppa | 2007-10-09 17:39:27 +0200 (Tue, 09 Oct 2007) | 4 lines Changed paths: M /trunk/ks-multiply.c Changed max(deg(A)+1, deg(B)+1) to min(...) in estimate of product coeff size. Added code to print message if resulting FFT size differs, currently disabled (enable by #define-ing TEST_OLD_S) ------------------------------------------------------------------------ r1030 | kruppa | 2007-10-09 12:02:13 +0200 (Tue, 09 Oct 2007) | 4 lines Changed paths: M /trunk/pm1fs2.c Sped up finding parameters for large lmax and small s_1. Timing output for h_i and g_i sequences. 
------------------------------------------------------------------------ r1029 | zimmerma | 2007-10-06 18:13:47 +0200 (Sat, 06 Oct 2007) | 2 lines Changed paths: M /trunk/mpmod.c fixed typos in comments ------------------------------------------------------------------------ r1028 | kruppa | 2007-10-06 15:17:58 +0200 (Sat, 06 Oct 2007) | 2 lines Changed paths: M /trunk/mpmod.c mpres_mul_z_to_z() now always produces non-negative, fully reduced result ------------------------------------------------------------------------ r1027 | kruppa | 2007-10-06 15:17:17 +0200 (Sat, 06 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c P+1 fast stage 2 aborts if m_1 < 0 until bug is fixed ------------------------------------------------------------------------ r1026 | kruppa | 2007-10-06 14:44:11 +0200 (Sat, 06 Oct 2007) | 2 lines Changed paths: M /trunk/pp1.c Print parameters correctly for new stage 2 ------------------------------------------------------------------------ r1025 | kruppa | 2007-10-05 21:29:18 +0200 (Fri, 05 Oct 2007) | 3 lines Changed paths: M /trunk/pm1fs2.c Fixed last step in P+1 stage 2 (accumulating product) which always has 0 in last term, causing N to be found as factor. TODO: find out why ------------------------------------------------------------------------ r1024 | kruppa | 2007-10-05 21:08:06 +0200 (Fri, 05 Oct 2007) | 3 lines Changed paths: M /trunk/test.pp1 Fixed test where 3^2-4 was a QR so P+1 really did P-1 (which just happened to work as well with the old code). Uses x0=6 now. ------------------------------------------------------------------------ r1023 | kruppa | 2007-10-05 19:32:33 +0200 (Fri, 05 Oct 2007) | 2 lines Changed paths: M /trunk/pp1.c Changed lmax to 2^20. ------------------------------------------------------------------------ r1022 | kruppa | 2007-10-05 17:56:05 +0200 (Fri, 05 Oct 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c M /trunk/pp1.c Fast P+1 stage 2 started working. More testing and optimization needed.
------------------------------------------------------------------------ r1021 | kruppa | 2007-10-04 17:54:53 +0200 (Thu, 04 Oct 2007) | 3 lines Changed paths: M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Generating sequences g and h for P+1 (hopefully) works now. Not well optimized yet. ------------------------------------------------------------------------ r1020 | kruppa | 2007-09-27 18:35:11 +0200 (Thu, 27 Sep 2007) | 3 lines Changed paths: M /trunk/pm1fs2.c Fixed bug in maxS(). Fixed bugs in gfp_ext_rn2(). Extended table of P values. Started pp1fs2() function for P+1 stage 2. ------------------------------------------------------------------------ r1019 | kruppa | 2007-09-25 16:15:26 +0200 (Tue, 25 Sep 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/pm1.c M /trunk/pm1fs2.c Choose and print parameters for new P-1 stage 2 at start of pm1(). ------------------------------------------------------------------------ r1018 | kruppa | 2007-09-25 16:07:47 +0200 (Tue, 25 Sep 2007) | 2 lines Changed paths: M /trunk/mpmod.c Moved MPZ_REALLOC from ecm_mulredc_basecase to mpres_mul* functions ------------------------------------------------------------------------ r1017 | kruppa | 2007-09-25 14:53:20 +0200 (Tue, 25 Sep 2007) | 3 lines Changed paths: M /trunk/mpmod.c Made ecm_mulredc_basecase() reallocate space in R instead of failing an assertion ------------------------------------------------------------------------ r1016 | kruppa | 2007-09-24 14:53:34 +0200 (Mon, 24 Sep 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixed bug in computing max(S_1 + S_2) during parameter selection. ------------------------------------------------------------------------ r1015 | kruppa | 2007-09-23 22:33:13 +0200 (Sun, 23 Sep 2007) | 2 lines Changed paths: M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/ecm_ntt.c M /trunk/listz.c M /trunk/mpmod.c M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/polyeval.c Fast P-1 stage 2 now uses parameterization as in the paper. 
------------------------------------------------------------------------ r1014 | zimmerma | 2007-09-14 08:48:50 +0200 (Fri, 14 Sep 2007) | 5 lines Changed paths: M /trunk/ChangeLog M /trunk/Makefile.am M /trunk/README.dev M /trunk/configure.in ChangeLog: added changes since release 6.1 that were missing README.dev: added hint about man page Makefile.am: ensure that ecm.1 is in the tarball configure.in: fixed warning from autoconf ------------------------------------------------------------------------ r1013 | kruppa | 2007-09-10 14:15:00 +0200 (Mon, 10 Sep 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Added code for computing r^(n^2) over quadratic extension ring ------------------------------------------------------------------------ r1012 | kruppa | 2007-09-10 14:13:47 +0200 (Mon, 10 Sep 2007) | 2 lines Changed paths: M /trunk/NEWS Changes for 6.1.2 and 6.1.3 ------------------------------------------------------------------------ r1011 | zimmerma | 2007-09-10 09:16:45 +0200 (Mon, 10 Sep 2007) | 3 lines Changed paths: M /trunk/mpmod.c M /trunk/pm1fs2.c mpmod.c: patch from Alex (wrong initialization in mpmod_init_MPZ) pm1fs2.c: default is now method=1 instead of method=0 ------------------------------------------------------------------------ r1010 | zimmerma | 2007-09-07 17:05:31 +0200 (Fri, 07 Sep 2007) | 2 lines Changed paths: M /trunk/mpmod.c use Mulder's algorithm (ecm_redc_n) in REDC only when xn=2n ------------------------------------------------------------------------ r1009 | zimmerma | 2007-09-07 16:44:56 +0200 (Fri, 07 Sep 2007) | 2 lines Changed paths: M /trunk/TODO added analysis of slowdown reported by Ch. Clavier ------------------------------------------------------------------------ r1008 | zimmerma | 2007-09-07 16:42:20 +0200 (Fri, 07 Sep 2007) | 4 lines Changed paths: M /trunk/mpmod.c modified ecm_redc_n to allow xn = 2n-1 too (happens often when the high limb of the modulus has few bits); unfortunately for large inputs (e.g. 
the c58672 in TODO) this seems to be slower than the else-branch in REDC ------------------------------------------------------------------------ r1007 | zimmerma | 2007-09-07 14:24:17 +0200 (Fri, 07 Sep 2007) | 3 lines Changed paths: M /trunk/configure.in changed version number of development version to 6.2 (6.1.1 was already an existing release) ------------------------------------------------------------------------ r1006 | zimmerma | 2007-09-07 14:15:55 +0200 (Fri, 07 Sep 2007) | 2 lines Changed paths: M /trunk/ecm-gmp.h fixed incorrect comment ------------------------------------------------------------------------ r1005 | zimmerma | 2007-09-07 14:10:00 +0200 (Fri, 07 Sep 2007) | 3 lines Changed paths: M /trunk/INSTALL M /trunk/ecm-gmp.h INSTALL: updated GMP web page and version ecm-gmp.h: mpn_mul_fft now returns int (>= GMP 4.2.1) ------------------------------------------------------------------------ r1004 | kruppa | 2007-09-04 16:44:51 +0200 (Tue, 04 Sep 2007) | 2 lines Changed paths: M /trunk/mpmod.c Corrected bugfix for using mpn_mul_fft(). ------------------------------------------------------------------------ r1003 | kruppa | 2007-09-03 12:22:00 +0200 (Mon, 03 Sep 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Fixed some compiler warnings about unused variables and functions. ------------------------------------------------------------------------ r1002 | kruppa | 2007-08-31 19:35:28 +0200 (Fri, 31 Aug 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c Marked input arguments of functions "const".
------------------------------------------------------------------------ r1001 | kruppa | 2007-08-31 19:34:40 +0200 (Fri, 31 Aug 2007) | 2 lines Changed paths: M /trunk/ecm_ntt.c Fixed warning the right way this time ------------------------------------------------------------------------ r1000 | kruppa | 2007-08-31 19:33:54 +0200 (Fri, 31 Aug 2007) | 2 lines Changed paths: M /trunk/ecm.c Made return values use the FACTOR_FOUND defines ------------------------------------------------------------------------ r999 | kruppa | 2007-08-30 22:34:36 +0200 (Thu, 30 Aug 2007) | 2 lines Changed paths: M /trunk/ecm_ntt.c Circumvent compiler warnings ------------------------------------------------------------------------ r998 | kruppa | 2007-08-30 16:42:49 +0200 (Thu, 30 Aug 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c M /trunk/pm1fs2.c M /trunk/test.pm1 Some code rearrangements in preparation for fast P+1 stage 2 ------------------------------------------------------------------------ r997 | zimmerma | 2007-08-29 16:46:17 +0200 (Wed, 29 Aug 2007) | 3 lines Changed paths: M /trunk/TODO M /trunk/ecm2.c M /trunk/sp.h M /trunk/tune.c TODO: added item (efficiency regression) tune.c, ecm2.c, sp.h: fixed compiler warnings with -W -Wall ------------------------------------------------------------------------ r996 | kruppa | 2007-08-24 15:48:33 +0200 (Fri, 24 Aug 2007) | 3 lines Changed paths: M /trunk/pm1.c Fixes bug reported by P.L.Montgomery: B1 was converted from double to unsigned long before assigning it to B2min, causing truncation. 
------------------------------------------------------------------------ r995 | zimmerma | 2007-08-01 17:57:38 +0200 (Wed, 01 Aug 2007) | 2 lines Changed paths: M /trunk/TODO updated reference ------------------------------------------------------------------------ r994 | zimmerma | 2007-07-31 13:52:59 +0200 (Tue, 31 Jul 2007) | 2 lines Changed paths: M /trunk/TODO added pointer to new algorithm ------------------------------------------------------------------------ r993 | zimmerma | 2007-07-22 13:24:41 +0200 (Sun, 22 Jul 2007) | 2 lines Changed paths: M /trunk/ecm.h applied patch from Emmanuel Thome to use the library mode from a C++ program ------------------------------------------------------------------------ r992 | zimmerma | 2007-07-15 10:22:57 +0200 (Sun, 15 Jul 2007) | 2 lines Changed paths: M /trunk/main.c updated P-1 champion size ------------------------------------------------------------------------ r991 | kruppa | 2007-06-18 13:01:29 +0200 (Mon, 18 Jun 2007) | 2 lines Changed paths: M /trunk/stage2.c Fixes incorrect memory estimate for stage 2 ------------------------------------------------------------------------ r990 | zimmerma | 2007-06-16 21:19:05 +0200 (Sat, 16 Jun 2007) | 2 lines Changed paths: M /trunk/main.c fixed bug #3448: better check for invalid B2 ------------------------------------------------------------------------ r989 | zimmerma | 2007-06-05 18:33:29 +0200 (Tue, 05 Jun 2007) | 2 lines Changed paths: M /trunk/eval.c fixed bug #3363 (Expression parser needs to check for remainder in division) ------------------------------------------------------------------------ r988 | kruppa | 2007-04-22 17:06:41 +0200 (Sun, 22 Apr 2007) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/mpmod.c M /trunk/pm1fs2.c Speedups and cleanups for fast stage 2 code. 
------------------------------------------------------------------------ r987 | kruppa | 2007-04-22 17:05:54 +0200 (Sun, 22 Apr 2007) | 3 lines Changed paths: M /trunk/median.c Removed unnecessary recursive calls in TToomCookMul_space() which could inflate run-time considerably for degenerate cases. ------------------------------------------------------------------------ r986 | kruppa | 2007-04-22 17:04:23 +0200 (Sun, 22 Apr 2007) | 2 lines Changed paths: M /trunk/ks-multiply.c Added ASSERTS to check that input coefficients are non-negative. ------------------------------------------------------------------------ r985 | kruppa | 2007-04-16 12:57:28 +0200 (Mon, 16 Apr 2007) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/pm1fs2.c Building symmetric polynomial as in draft 8.1 implemented. Some clean up work left to be done yet. Added Makefile target for pm1fs2 test drive binary. ------------------------------------------------------------------------ r984 | kruppa | 2007-03-30 14:43:19 +0200 (Fri, 30 Mar 2007) | 2 lines Changed paths: M /trunk/techdocs/buildpoly.tex Fixes. Added case p^k | n, i.e. n not squarefree. ------------------------------------------------------------------------ r983 | kruppa | 2007-03-28 19:05:08 +0200 (Wed, 28 Mar 2007) | 2 lines Changed paths: M /trunk/Makefile.am M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/main.c M /trunk/mpmod.c M /trunk/pm1.c M /trunk/pm1fs2.c Implemented building F from arithmetic progressions of prime length. ------------------------------------------------------------------------ r982 | kruppa | 2007-03-19 18:25:05 +0100 (Mon, 19 Mar 2007) | 3 lines Changed paths: M /trunk/techdocs/buildpoly.tex M /trunk/techdocs/convolv.tex More details on sets of coprime residues in buildpoly.tex. Small corrections and additions in convolv.tex.
------------------------------------------------------------------------ r981 | zimmerma | 2007-03-19 08:23:00 +0100 (Mon, 19 Mar 2007) | 2 lines Changed paths: M /trunk/main.c updated champion sizes ------------------------------------------------------------------------ r980 | kruppa | 2007-03-16 15:28:44 +0100 (Fri, 16 Mar 2007) | 2 lines Changed paths: M /trunk/techdocs/buildpoly.tex Small corrections, additions ------------------------------------------------------------------------ r979 | kruppa | 2007-03-15 21:06:05 +0100 (Thu, 15 Mar 2007) | 2 lines Changed paths: A /trunk/techdocs/buildpoly.tex A note on Montgomery's idea for fast building F from its roots ------------------------------------------------------------------------ r978 | kruppa | 2007-03-12 18:35:54 +0100 (Mon, 12 Mar 2007) | 2 lines Changed paths: M /trunk/techdocs/schoen_strass.tex Small fixes. ------------------------------------------------------------------------ r977 | kruppa | 2007-03-07 23:36:31 +0100 (Wed, 07 Mar 2007) | 2 lines Changed paths: M /trunk/pm1.c Removed duplicated "special division for 2^n+-1" message in P-1. ------------------------------------------------------------------------ r976 | zimmerma | 2007-03-07 18:56:25 +0100 (Wed, 07 Mar 2007) | 2 lines Changed paths: M /trunk/pm1fs2.c Output "Step 2 took ..." in normal mode (as in stage2.c) ------------------------------------------------------------------------ r975 | kruppa | 2007-03-07 10:41:57 +0100 (Wed, 07 Mar 2007) | 3 lines Changed paths: M /trunk/pm1fs2.c Print total stage 2 time. Fix memory allocation bug when modulus->bits < 0, i.e. for 2^n-1 numbers. 
------------------------------------------------------------------------ r974 | kruppa | 2007-03-06 11:25:22 +0100 (Tue, 06 Mar 2007) | 2 lines Changed paths: M /trunk/schoen_strass.c Make transposed Karatsuba return the number of multiplications used ------------------------------------------------------------------------ r973 | kruppa | 2007-03-06 10:43:51 +0100 (Tue, 06 Mar 2007) | 2 lines Changed paths: A /trunk/techdocs A /trunk/techdocs/convolv.tex A /trunk/techdocs/curve_convert.tex A /trunk/techdocs/schoen_strass.tex Notes on some of the math and algorithms used in GMP-ECM ------------------------------------------------------------------------ r972 | kruppa | 2007-03-05 18:37:11 +0100 (Mon, 05 Mar 2007) | 2 lines Changed paths: M /trunk/bestd.c M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/mpmod.c M /trunk/pm1.c M /trunk/pm1fs2.c M /trunk/pp1.c Debugging and speedups for Fast P-1 stage 2 ------------------------------------------------------------------------ r971 | kruppa | 2007-03-05 18:35:38 +0100 (Mon, 05 Mar 2007) | 2 lines Changed paths: M /trunk/schoen_strass.c Debugging for lenA == lenB/2+1 case ------------------------------------------------------------------------ r970 | kruppa | 2007-03-05 18:33:07 +0100 (Mon, 05 Mar 2007) | 2 lines Changed paths: M /trunk/median.c Added some ASSERT()s ------------------------------------------------------------------------ r969 | kruppa | 2007-03-05 18:31:58 +0100 (Mon, 05 Mar 2007) | 2 lines Changed paths: M /trunk/listz.c Added/corrected some ASSERT()s ------------------------------------------------------------------------ r968 | kruppa | 2007-02-26 11:21:33 +0100 (Mon, 26 Feb 2007) | 2 lines Changed paths: M /trunk/x86_64/mulredc2.asm Reverting this file to revision 908 ------------------------------------------------------------------------ r967 | kruppa | 2007-02-26 03:08:58 +0100 (Mon, 26 Feb 2007) | 3 lines Changed paths: M /trunk/Makefile.am M /trunk/ecm-impl.h M /trunk/ecm_ntt.c M /trunk/listz.c M
/trunk/main.c M /trunk/median.c M /trunk/mpmod.c M /trunk/pm1.c A /trunk/pm1fs2.c M /trunk/schoen_strass.c M /trunk/stage2.c M /trunk/x86_64/bench.c M /trunk/x86_64/mulredc1.asm M /trunk/x86_64/mulredc2.asm Changes in preparation of a fast P-1 stage 2. Crude and incomplete implementation of fast P-1 stage 2. ------------------------------------------------------------------------ r966 | zimmerma | 2007-02-15 22:48:33 +0100 (Thu, 15 Feb 2007) | 2 lines Changed paths: M /trunk/ecm-params.h.core2 removed extra line break ------------------------------------------------------------------------ r965 | zimmerma | 2007-02-15 22:39:06 +0100 (Thu, 15 Feb 2007) | 2 lines Changed paths: A /trunk/ecm-params.h.core2 tuned parameters contributed by tom@womack.net ------------------------------------------------------------------------ r964 | zimmerma | 2007-02-05 18:15:21 +0100 (Mon, 05 Feb 2007) | 2 lines Changed paths: M /trunk/bestd.c extended again the d-table with good phi(d) ------------------------------------------------------------------------ r963 | zimmerma | 2007-02-05 18:01:21 +0100 (Mon, 05 Feb 2007) | 2 lines Changed paths: M /trunk/bestd.c extended table for d values with good phi(d) ------------------------------------------------------------------------ r962 | kruppa | 2007-02-04 00:17:46 +0100 (Sun, 04 Feb 2007) | 2 lines Changed paths: M /trunk/ecm.c Print point and curve in Weierstrass form at beginning of stage 2 with -v -v ------------------------------------------------------------------------ r961 | zimmerma | 2007-01-02 16:58:44 +0100 (Tue, 02 Jan 2007) | 2 lines Changed paths: A /trunk/c200 new test number (from 10^284+1) ------------------------------------------------------------------------ r960 | zimmerma | 2006-12-18 10:45:08 +0100 (Mon, 18 Dec 2006) | 2 lines Changed paths: M /trunk/AUTHORS added NTT contribution for Dave ------------------------------------------------------------------------ r959 | zimmerma | 2006-12-18 09:44:03 +0100 (Mon, 18 
Dec 2006) | 2 lines Changed paths: M /trunk/COPYING M /trunk/COPYING.LIB M /trunk/Fgw.c M /trunk/auxi.c M /trunk/auxlib.c M /trunk/b1_ainc.c M /trunk/bestd.c M /trunk/bestdaux.c M /trunk/candi.c M /trunk/config.guess M /trunk/config.sub M /trunk/configfsf.guess M /trunk/configfsf.sub M /trunk/countsmooth.c M /trunk/ecm-ecm.h M /trunk/ecm-gmp.h M /trunk/ecm-impl.h M /trunk/ecm.c M /trunk/ecm.h M /trunk/ecm2.c M /trunk/ecm_ntt.c M /trunk/ecmfactor.c M /trunk/eval.c M /trunk/factor.c M /trunk/getprime.c M /trunk/ks-multiply.c M /trunk/listz.c M /trunk/longlong.h M /trunk/lucas.c M /trunk/main.c M /trunk/median.c M /trunk/memory.c M /trunk/mpmod.c M /trunk/mpzspm.c M /trunk/mpzspv.c M /trunk/mul_lo.c M /trunk/ntt_gfp.c M /trunk/pm1.c M /trunk/polyeval.c M /trunk/pp1.c M /trunk/random.c M /trunk/resume.c M /trunk/rho.c M /trunk/runecm2.c M /trunk/schoen_strass.c M /trunk/sp.c M /trunk/sp.h M /trunk/spm.c M /trunk/spv.c M /trunk/stage2.c M /trunk/test.ecm M /trunk/test.pm1 M /trunk/test.pp1 M /trunk/toomcook.c M /trunk/trial.c M /trunk/tune.c changed address of FSF to new one ------------------------------------------------------------------------ r958 | zimmerma | 2006-12-12 13:32:34 +0100 (Tue, 12 Dec 2006) | 4 lines Changed paths: M /trunk/mpmod.c moved "using special division" from mpmod_init to mpmod_init_base2 (the latter is called directly from ecm.c, thus the message was not displayed, as noticed by Peter Montgomery) ------------------------------------------------------------------------ r957 | kruppa | 2006-10-10 12:45:57 +0200 (Tue, 10 Oct 2006) | 2 lines Changed paths: M /trunk/trial.c Avoid infinite loop in trial division with zero as input number. 
------------------------------------------------------------------------ r956 | zimmerma | 2006-10-05 09:18:21 +0200 (Thu, 05 Oct 2006) | 2 lines Changed paths: M /trunk/INSTALL added hint about -max_log2_len option for tune ------------------------------------------------------------------------ r955 | zimmerma | 2006-10-05 09:14:30 +0200 (Thu, 05 Oct 2006) | 3 lines Changed paths: M /trunk/tune.c max_log2_len is now a command-line parameter in tune.c (suggestion from Thomas M.Ott) ------------------------------------------------------------------------ r954 | kruppa | 2006-08-22 12:34:55 +0200 (Tue, 22 Aug 2006) | 3 lines Changed paths: M /trunk/ecm-ecm.h M /trunk/main.c Print first and last ten digits of numbers >1000 digits. Small cleanup in ecm-ecm.h (duplicated #define's). ------------------------------------------------------------------------ r953 | kruppa | 2006-08-06 14:46:56 +0200 (Sun, 06 Aug 2006) | 2 lines Changed paths: M /trunk/TODO Added several ideas ------------------------------------------------------------------------ r952 | kruppa | 2006-08-05 08:56:44 +0200 (Sat, 05 Aug 2006) | 2 lines Changed paths: M /trunk/auxlib.c M /trunk/ecm-impl.h More cleanups of -chkpnt code ------------------------------------------------------------------------ r951 | kruppa | 2006-08-04 17:16:17 +0200 (Fri, 04 Aug 2006) | 4 lines Changed paths: M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/stage2.c Fixed stupid segfault in chkpnt code. Changed checkpoint interval to 10 min. Fixed bug when cleaning up treefiles after receiving signal. 
------------------------------------------------------------------------ r950 | kruppa | 2006-08-03 21:10:44 +0200 (Thu, 03 Aug 2006) | 3 lines Changed paths: M /trunk/README M /trunk/auxlib.c M /trunk/ecm-impl.h M /trunk/ecm.1 M /trunk/ecm.c M /trunk/ecm.h M /trunk/ecm.xml M /trunk/factor.c M /trunk/main.c M /trunk/pm1.c M /trunk/pp1.c M /trunk/resume.c M /trunk/stage2.c By popular request: added option to write checkpoints periodically during stage 1. ------------------------------------------------------------------------ r949 | kruppa | 2006-08-03 18:54:47 +0200 (Thu, 03 Aug 2006) | 2 lines Changed paths: M /trunk/ecm-impl.h M /trunk/listz.c M /trunk/stage2.c stage2.c did not #include config.h, so unistd.h was not included either. ------------------------------------------------------------------------ r948 | zimmerma | 2006-07-27 14:16:43 +0200 (Thu, 27 Jul 2006) | 2 lines Changed paths: M /trunk/main.c new size for P-1 champion ------------------------------------------------------------------------ r947 | zimmerma | 2006-07-25 14:10:22 +0200 (Tue, 25 Jul 2006) | 2 lines Changed paths: M /trunk/sp.h added copyright information ------------------------------------------------------------------------ r946 | zimmerma | 2006-07-25 13:29:42 +0200 (Tue, 25 Jul 2006) | 2 lines Changed paths: M /trunk/ecm2.c M /trunk/main.c M /trunk/stage2.c fixed some compiler warnings ------------------------------------------------------------------------ r945 | kruppa | 2006-07-25 12:09:07 +0200 (Tue, 25 Jul 2006) | 4 lines Changed paths: M /trunk/stage2.c Following a comment of P.L.Montgomery, added message when computing product of the F(g_i) at end of stage 2 to avoid unexplained delay for large input numbers. ------------------------------------------------------------------------ r944 | lfousse | 2006-07-19 22:30:59 +0200 (Wed, 19 Jul 2006) | 2 lines Changed paths: M /trunk/stage2.c Include header for `unlink' in stage2.c. 
------------------------------------------------------------------------ r943 | zimmerma | 2006-07-17 10:10:51 +0200 (Mon, 17 Jul 2006) | 2 lines Changed paths: M /trunk/main.c new size for P+1 champion ------------------------------------------------------------------------ r942 | zimmerma | 2006-07-07 10:59:37 +0200 (Fri, 07 Jul 2006) | 2 lines Changed paths: M /trunk/factor.c stage1time and use_ntt were not initialized in ecm_init() ------------------------------------------------------------------------ r941 | zimmerma | 2006-05-30 08:56:15 +0200 (Tue, 30 May 2006) | 2 lines Changed paths: M /trunk/main.c changed minimum size of ecm champions ------------------------------------------------------------------------ r940 | lfousse | 2006-05-24 13:28:47 +0200 (Wed, 24 May 2006) | 2 lines Changed paths: M /trunk/runecm2.c Use a *real* dummy domain instead of an existing one for the example email. ------------------------------------------------------------------------ r939 | zimmerma | 2006-05-13 08:19:37 +0200 (Sat, 13 May 2006) | 2 lines Changed paths: M /trunk/runecm2.c changed email ------------------------------------------------------------------------ r938 | zimmerma | 2006-05-05 17:20:28 +0200 (Fri, 05 May 2006) | 2 lines Changed paths: M /trunk/configure.in fixed calls to AC_ARG_ENABLE() ------------------------------------------------------------------------ r937 | lfousse | 2006-05-05 17:08:43 +0200 (Fri, 05 May 2006) | 2 lines Changed paths: M /trunk/configure.in Use proper invocation of AC_ARG_ENABLE for asm-redc.
------------------------------------------------------------------------ r936 | zimmerma | 2006-05-05 17:05:37 +0200 (Fri, 05 May 2006) | 2 lines Changed paths: M /trunk/configure.in M /trunk/ecm.1 M /trunk/ecm.xml changed mailing-list address ------------------------------------------------------------------------ r935 | gaudry | 2006-05-05 16:34:32 +0200 (Fri, 05 May 2006) | 1 line Changed paths: M /trunk/configure.in fixed error message for --enable-asm-redc ------------------------------------------------------------------------ r934 | lfousse | 2006-05-05 16:01:31 +0200 (Fri, 05 May 2006) | 2 lines Changed paths: M /trunk/Makefile.am A /trunk/athlon/Makefile.am M /trunk/configure.in A /trunk/pentium4/Makefile.am A /trunk/x86_64/Makefile.am Use automake's DIST_DIRS feature for a cleaner `dist' target. ------------------------------------------------------------------------ r933 | lfousse | 2006-05-05 15:49:15 +0200 (Fri, 05 May 2006) | 2 lines Changed paths: M /trunk/Makefile.am redc assembly files in toplevel dir are symlinks and should not be part of `dist'. 
------------------------------------------------------------------------ r932 | zimmerma | 2006-05-03 12:08:07 +0200 (Wed, 03 May 2006) | 2 lines Changed paths: M /trunk/INSTALL added url of mailing-list archive ------------------------------------------------------------------------ r931 | zimmerma | 2006-05-03 09:36:15 +0200 (Wed, 03 May 2006) | 2 lines Changed paths: M /trunk/Fgw.c M /trunk/configure.in M /trunk/mpzspv.c get rid of valloc ------------------------------------------------------------------------ r930 | zimmerma | 2006-05-02 17:50:04 +0200 (Tue, 02 May 2006) | 2 lines Changed paths: M /trunk/AUTHORS M /trunk/INSTALL M /trunk/Makefile.am M /trunk/README.dev M /trunk/configure.in fixed stupid removal in Makefile.am, prepared for release 6.1.1 ------------------------------------------------------------------------ r928 | zimmerma | 2006-05-01 22:51:53 +0200 (Mon, 01 May 2006) | 2 lines Changed paths: M /trunk/ChangeLog added 'tag' for release 6.1 in ChangeLog Mon May 01 2006 22:49:07 zimmerma -- r927 Released version 6.1. Mon May 01 2006 13:01:15 kruppa -- r923 * trunk/Fgw.c, trunk/main.c, trunk/ecm2.c: modified Small cleanups. Added exit code 143 when exiting due to signal. Sun Apr 02 2006 18:38:08 zimmerma -- r922 * trunk/INSTALL: modified updated to gmp-4.2 and ecm-6.1 Fri Mar 31 2006 13:36:34 zimmerma -- r921 * trunk/TODO: modified added item Wed Mar 22 2006 11:46:18 gaudry -- r920 * trunk/configure.in: modified When asm-redc is enabled, check whether the computer is not too old, because the default asm code needs at least a PPro or a k7.
Fri Mar 17 2006 15:31:58 zimmerma -- r919 * trunk/TODO, trunk/configure.in, trunk/Makefile.am: modified added missing sources in Makefile.am added suggestion from James Wanless Fri Mar 17 2006 14:18:39 zimmerma -- r918 * trunk/README.dev: modified updated ChangeLog instructions Fri Mar 17 2006 13:55:41 zimmerma -- r917 * trunk/ecm.1, trunk/ecm.xml, trunk/AUTHORS, trunk/TODO, trunk/configure.in, trunk/ChangeLog, trunk/README.dev, trunk/NEWS: modified updated NEWS/ChangeLog for 6.0.1 release Fri Mar 17 2006 08:51:14 zimmerma -- r916 * trunk/pm1.c, trunk/pp1.c, trunk/ecm.c, trunk/ecm-impl.h: modified default B2 is less aggressive (exponent 1.43 instead of 1.5) put exponent and costs as macros in ecm-impl.h Tue Mar 14 2006 15:45:26 zimmerma -- r915 * trunk/mpmod.c, trunk/test.ecm: modified fixed bug reported by Allan Steel Fri Mar 10 2006 15:42:07 zimmerma -- r914 * trunk/test.pp1: modified fixed new test to work on 32-bit machine too Fri Mar 10 2006 15:11:54 zimmerma -- r913 * trunk/test.pp1: modified added test case for P+1 bug Fri Mar 10 2006 14:45:29 zimmerma -- r912 * trunk/TODO: modified added workaround Fri Mar 10 2006 14:11:26 zimmerma -- r911 * trunk/TODO: modified added suggestion from Bernstein Tue Mar 07 2006 16:15:54 zimmerma -- r910 * trunk/lucas.c: modified fixed overflow bug in P+1 (unsigned int -> unsigned long) P+1 was probably not working for B1>2^32 on 64-bit machines Tue Mar 07 2006 15:59:57 zimmerma -- r909 * trunk/README: modified changed s into x0 Mon Mar 06 2006 13:49:35 gaudry -- r908 * (MANY FILES) : added * trunk/redc.asm: deleted * trunk/mpmod.c, trunk/ecm.c, trunk/configure.in, trunk/Makefile.am: modified Redc and combined Mul/Redc in asm for different archi (p4, athlon, amd64) Configure.in and Makefile.am modified accordingly. 
Wed Feb 22 2006 15:58:39 zimmerma -- r907 * trunk/mpmod.c, trunk/schoen_strass.c, trunk/Fgw.c: modified added warnings for uses of _mpz_realloc Wed Feb 22 2006 15:05:45 zimmerma -- r906 * trunk/TODO: modified modified one bug item Thu Feb 16 2006 13:40:16 gaudry -- r905 * trunk/ecm2.c: modified Fixed some memory bugs in multiplyW2n(), that occured in -v -v and/or WANT_ASSERT mode, because it is called with NULL as first arguments. Fixed a double free of variables "coeffs" and a missing free for fd. Wed Feb 15 2006 14:08:12 zimmerma -- r904 * trunk/TODO, trunk/INSTALL: modified modified instructions for gwnum Wed Feb 15 2006 13:50:49 zimmerma -- r903 * trunk/TODO, trunk/stage2.c: modified disable NTT for Fermat numbers Tue Feb 14 2006 10:49:15 zimmerma -- r902 * trunk/TODO: modified added more examples of problems with Fermat numbers Mon Feb 13 2006 13:06:33 zimmerma -- r901 * trunk/TODO: modified added item Thu Jan 19 2006 17:40:59 zimmerma -- r900 * trunk/TODO: modified added item in TODO Mon Jan 16 2006 21:47:11 zimmerma -- r899 * trunk/ecm.c, trunk/lucas.c: modified improved to 17-digit values of decimal constants used in PRAC Sun Jan 15 2006 21:31:16 kruppa -- r898 * trunk/pm1.c, trunk/pp1.c, trunk/ecm.c, trunk/stage2.c, trunk/ecm- impl.h: modified Added some stop_asap() checks to stage 2. Sun Jan 15 2006 09:13:20 zimmerma -- r897 * trunk/TODO: modified added item in TODO (CPUTIME) Fri Jan 13 2006 21:22:14 kruppa -- r896 * trunk/pm1.c, trunk/factor.c, trunk/pp1.c, trunk/ecm.c, trunk/main.c, trunk/ecm-impl.h, trunk/ecm.h: modified Signal handling for P+1 added, save files contain correct B1done value. Stage 2 and cleanup TBD. Thu Jan 12 2006 23:08:21 kruppa -- r895 * trunk/pm1.c, trunk/factor.c, trunk/pp1.c, trunk/ecm.c, trunk/main.c, trunk/configure.in, trunk/ecm.h: modified Add signal handler to exit gracefully. ECM and P-1 stage 1 mostly done, rest TBD. 
Wed Dec 21 2005 08:36:30 kruppa -- r894 * trunk/schoen_strass.c: modified Bugfix: static mpz_t gt might be used after being mpz_clear()'ed Wed Dec 21 2005 00:38:10 kruppa -- r893 * trunk/schoen_strass.c, trunk/stage2.c: modified Mem leak fix: clear static mpz_t in schoen_strass.c at end of stage 2. Wed Dec 21 2005 00:31:33 kruppa -- r892 * trunk/mpmod.c, trunk/test.ecm, trunk/listz.c, trunk/stage2.c, trunk/ecm2.c, trunk/ecm-impl.h: modified Print stage 2 prime of group order if factor was found and -v -v. Needs more polishing and not tested as much as I'd like yet. Sun Nov 13 2005 09:29:37 kruppa -- r891 * trunk/README, trunk/ecm.1, trunk/ecm.xml, trunk/main.c: modified Added -idlecmd option to pause or quit GMP-ECM when system is busy Sat Nov 12 2005 07:40:08 kruppa -- r890 * trunk/ecm.c: modified Tests in stage 1 if point at infinity is reached and prints message in verbose mode. Nice for finding group order of curves. Fri Nov 11 2005 07:18:51 zimmerma -- r889 * trunk/main.c: modified changed champs information, and updated minimal digit size to get champions Thu Oct 27 2005 11:11:19 zimmerma -- r888 * trunk/ecm.1: modified file generated from ecm.xml, with empty lines manually removed in the last paragraph Thu Oct 27 2005 11:01:53 zimmerma -- r887 * trunk/ecm.xml: modified added line-breaks in AUTHORS section Thu Oct 27 2005 07:54:42 zimmerma -- r886 * trunk/README.dev: modified add hint for autoreconf Wed Oct 26 2005 07:51:56 zimmerma -- r885 * trunk/ecm-impl.h: modified patch for old gcc versions Wed Oct 26 2005 07:01:04 zimmerma -- r884 * trunk/TODO: modified added item Mon Oct 24 2005 17:13:15 zimmerma -- r883 * trunk/ecm.c: modified changed default B2 for ecm Sat Oct 22 2005 15:45:43 zimmerma -- r882 * trunk/AUTHORS: modified added pointer to gforge Thu Sep 29 2005 19:26:35 zimmerma -- r881 * trunk/runecm2.c: modified fixed potential buffer overrun Thu Sep 29 2005 12:08:52 zimmerma -- r880 * trunk/runecm2.c: added contribution from Torbjo"rn Wed Sep 28 2005 
13:30:51 kruppa -- r879 * trunk/ecm.c: modified Added warning about prac() bug (calling add3() with identical points) Sat Sep 10 2005 19:56:55 kruppa -- r878 * trunk/README, trunk/main.c: modified Slight cleanup of shell command code, replaced "-prp*" section of README by shellcmd section. Thu Sep 08 2005 19:51:02 kruppa -- r877 * trunk/README, trunk/candi.c, trunk/main.c, trunk/TODO: modified Made -one work better when used with -resume Tue Sep 06 2005 14:02:20 zimmerma -- r876 * trunk/TODO: modified added item Mon Sep 05 2005 12:09:53 zimmerma -- r875 * trunk/Makefile.am: modified added missing entries for "make dist" Thu Sep 01 2005 15:17:44 dnewman -- r874 * trunk/stage2.c: modified Make memory_use take into account sp_F. Thu Sep 01 2005 13:15:42 dnewman -- r873 * trunk/stage2.c, trunk/ecm-impl.h, trunk/ecm_ntt.c, trunk/tune.c: modified Precompute transform of F for use in ntt_PrerevertDivision. Wed Aug 24 2005 13:55:52 zimmerma -- r872 * trunk/check.mpl: modified added add3/duplicate code in Montgomery's coordinates Sat Aug 20 2005 18:41:19 kruppa -- r871 * trunk/TODO: modified Marked -stage1time done, removed shell commands entry (also done) Sat Aug 20 2005 18:37:59 kruppa -- r870 * trunk/Fgw.c, trunk/ecm.c: modified Assume ecm stage 1 always available in GWNUM library, pass error codes from gw_stage_1() back correctly Sat Aug 20 2005 18:16:24 kruppa -- r869 * trunk/test.ecm: modified Some more tests for base 2 numbers, for testing the GWNUM stage 1 Fri Aug 19 2005 21:00:35 kruppa -- r868 * trunk/Makefile.am: modified Fixed filenames in EXTRA_DIST Fri Aug 19 2005 14:23:03 dnewman -- r867 * (MANY FILES) : modified Comprehensive header cleanup; in particular, headers now satisfy their dependencies on other headers. Moved the -n / -nn renicing code from main.c to macros NICE10 / NICE20 in ecm-ecm.h. Added the configure option --enable-memory-debug to conditionally compile memory.c. Fixed some printf format/argument mismatches in tune.c and memory.c. 
Added some info on NTT and tune to README and documented -no-ntt in ecm.xml. Wed Aug 17 2005 22:47:58 dnewman -- r866 * trunk/mpzspv.c, trunk/TODO, trunk/configure.in, trunk/mpzspm.c, trunk/sp.h, trunk/spv.c, trunk/tune.c: modified Changed mpzspm_t to use spm's instead of __spm_struct's. Added code to use memmove in spv_set. Removed all bovine activity in configure.in. Minor update to TODO. Wed Aug 17 2005 16:11:28 dnewman -- r865 * trunk/schoen_strass.c, trunk/spv.c, trunk/ecm-gmp.h, trunk/tune.c: modified Changed RNG in tune.c to avoid using get_random_ui() as on some platforms (MinGW) it's too slow to be called many times. Fixed declarations of __gmpn_add_nc and __gmpn_mod_34lsub1 which were causing segfaults under Cygwin. Wed Aug 10 2005 12:22:34 dnewman -- r864 * trunk/spm.c, trunk/sp.c, trunk/sp.h, trunk/tune.c: modified Fixed potential problem with spm_init's generation of primitive roots. Tue Aug 09 2005 16:31:34 dnewman -- r863 * trunk/tune.c: modified Changed GRANULARITY to 250ms for more precision. Added a '-v' cmdline option that prints every function evaluation to stderr; also added a TUNE_SLOW define to give possibly more consistent results. Wed Aug 03 2005 20:45:24 dnewman -- r862 * trunk/nbdigits.c: deleted * trunk/auxi.c: modified Rewrote nb_digits() to remove dependency on string.h and FREE(). Deleted an empty file. Wed Aug 03 2005 20:13:48 dnewman -- r861 * trunk/redc.asm: added * trunk/redc.s: deleted * trunk/acinclude.m4, trunk/configure.in, trunk/Makefile.am: modified Imported some more routines from GMP's acinclude.m4 to fix problem with --enable-asm-redc under windows. Renamed redc.s back to redc.asm as it now goes through m4 before the assembler. Fixed a minor issue with architecture detection in configure.in. 
Wed Aug 03 2005 09:14:19 dnewman -- r860 * trunk/ecm-params.h.athlon, trunk/ecm-params.h.powerpc7450: added * trunk/ecm-params.h.athlonxp, trunk/ecm-params.h.power4: deleted * trunk/configure.in: modified Some tweaks to ecm-params detection. Wed Aug 03 2005 08:42:27 dnewman -- r859 * trunk/config.sub, trunk/configfsf.guess, trunk/config.guess, trunk/configfsf.sub: added Added GMP's finer-grained CPU detection (useful for selecting the right ecm-params.h) Tue Aug 02 2005 18:47:24 kruppa -- r858 * trunk/ecm.1: added Adding man page ecm.1 to CVS (xsltproc/docbook not available everywhere) Tue Aug 02 2005 17:46:09 dnewman -- r857 * trunk/pm1.c, trunk/factor.c, trunk/pp1.c, trunk/bestd.c, trunk/TODO, trunk/ecm.c, trunk/main.c, trunk/configure.in, trunk/stage2.c, trunk/Makefile.am, trunk/ecm-impl.h, trunk/ecm.h, trunk/tune.c: modified Removed the configure option --enable-ntt. Now ntt code is used by default but can be disabled with the command-line option -no-ntt. Lots of changes to function prototypes to accommodate this. Tue Aug 02 2005 16:58:46 kruppa -- r856 * trunk/test.pp1, trunk/test.ecm, trunk/test.pm1: modified Changed syntax of function to sh (Bourne shell) instead of bash Tue Aug 02 2005 16:30:21 dnewman -- r855 * trunk/getprime2.c, trunk/random2.c: deleted * trunk/test.pp1, trunk/random.c, trunk/test.ecm, trunk/Makefile.am, trunk/test.pm1: modified Changed test script shell to /bin/sh as MinGW doesn't have bash. Makefile.am now replaces the getprime2.c and random2.c hacks. Fixed CryptGenRandom() in random.c (maybe) and cleaned up the #includes a bit. Tue Aug 02 2005 15:14:30 kruppa -- r854 * trunk/bestd.c, trunk/ks-multiply.c, trunk/tune.c, trunk/trial.c: modified Fixed compiler warnings with gcc -Wall -W Tue Aug 02 2005 10:06:02 kruppa -- r853 * trunk/ecm.xml: modified Removed -prp* section, added shell commands section. Fixed exit code tables. Mon Aug 01 2005 22:52:06 dnewman -- r852 * trunk/main.c, trunk/trial.c: modified Cleanups to trial.c. 
Bugfix to probab_prime_p. Mon Aug 01 2005 22:00:03 kruppa -- r851 * trunk/smartprp.c: deleted * trunk/candi.c, trunk/main.c, trunk/configure.in, trunk/Makefile.am, trunk/ecm-ecm.h: modified Removed -prp* options and smartprp.c, added -prpcmd option Mon Aug 01 2005 20:13:11 kruppa -- r850 * trunk/pm1.c, trunk/factor.c, trunk/pp1.c, trunk/random.c, trunk/auxlib.c, trunk/main.c, trunk/ecm.c, trunk/stage2.c, trunk/ecm2.c, trunk/ecm-impl.h, trunk/memory.c, trunk/ecm.h, trunk /ks-multiply.c, trunk/tune.c: modified Added -stage1time option. All time-keeping variables are of type long now. Mon Aug 01 2005 16:45:56 kruppa -- r849 * trunk/mul_fft.c: deleted Not needed for GMP-ECM (part of GMP) Sun Jul 31 2005 18:10:36 kruppa -- r848 * trunk/mpmod.c, trunk/factor.c, trunk/bestd.c, trunk/auxlib.c, trunk/TODO, trunk/main.c, trunk/ecm.c, trunk/listz.c, trunk/stage2.c, trunk/ecm2.c: modified Allocate more memory to mpz_t's in stage 2 to avoid reallocs. More allocation locations tagged for mem leak/realloc debugging. Fixed segfault in stage 2 (if factor found in roots of F) Sun Jul 31 2005 18:00:32 kruppa -- r847 * trunk/memory.c: modified Prints peak memory allocation Sat Jul 30 2005 15:24:49 kruppa -- r846 * trunk/mpmod.c, trunk/median.c, trunk/TODO, trunk/listz.c, trunk/stage2.c, trunk/ecm2.c, trunk/ecm-impl.h, trunk/memory.c, trunk/ks-multiply.c, trunk/ecm-ecm.h, trunk/polyeval.c: modified Some changes to avoid unnecessary reallocs. 
memory.c can print location of mpz_init() that led to mem leak/realloc (if tagged) Sat Jul 30 2005 15:08:14 kruppa -- r845 * trunk/configure.in: modified Looks for DocBook stylesheets in several directories Sat Jul 30 2005 15:07:08 kruppa -- r844 * trunk/README.lib: modified Added maxmem entry Fri Jul 29 2005 13:50:56 kruppa -- r843 * trunk/random.c: modified Removed leftover debug output Thu Jul 28 2005 23:17:58 kruppa -- r842 * trunk/pm1.c, trunk/factor.c, trunk/random.c, trunk/pp1.c, trunk/ecm.c, trunk/main.c, trunk/ecm.h: modified Seed RNG only once per program invocation. Use GetRandCrypt() under Windows, but untested yet: Wine lacks required dlls. Thu Jul 28 2005 20:55:16 kruppa -- r841 * trunk/pm1.c, trunk/factor.c, trunk/pp1.c, trunk/bestd.c, trunk/ecm.xml, trunk/TODO, trunk/auxlib.c, trunk/ecm.c, trunk/main.c, trunk/stage2.c, trunk/ecm2.c, trunk/ecm-impl.h, trunk/rho.c, trunk/ecm.h: modified Added -maxmem option. Memory estimate not perfectly accurate yet. Sun Jul 24 2005 21:40:14 kruppa -- r840 * trunk/pm1.c, trunk/test.pp1, trunk/pp1.c, trunk/bestd.c, trunk/test.ecm, trunk/ecm.c, trunk/stage2.c, trunk/test.pm1: modified Fixed bug that occurred when B2 < B2min (did a stage 2, but shouldn't) Sun Jul 24 2005 19:51:36 kruppa -- r839 * trunk/Fgw.c, trunk/ecm.c, trunk/main.c: modified Cleanups in Fgw.c Sat Jul 23 2005 23:00:51 kruppa -- r838 * trunk/ecm.xml: modified Added chapter for exit status values Sat Jul 23 2005 21:51:24 kruppa -- r837 * trunk/mpmod.c, trunk/schoen_strass.c, trunk/configure.in, trunk/Makefile.am: modified Correctly aligns GWDATA segment when GWNUM library is used. Fixed compilation of tune when using GWNUM. Fri Jul 22 2005 21:14:45 kruppa -- r836 * trunk/Fgw.c, trunk/Makefile.am: modified Speedup for mpz_t <-> gwnum conversion. An elusive bug remains.
Thu Jul 21 2005 13:22:12 dnewman -- r835 * trunk/tune2.c: deleted * trunk/ecm-params.h.alpha-ev6, trunk/mpmod.c, trunk/ecm- params.h.default, trunk/TODO, trunk/ecm-params.h.athlonxp, trunk/mul_lo.c, trunk/sp.h, trunk/Makefile.am, trunk/ecm- params.h.athlon64, trunk/ecm_ntt.c, trunk/mpzspv.c, trunk/ntt_gfp.c, trunk/configure.in, trunk/ecm-impl.h, trunk/ecm- params.h.power4, trunk/ecm-params.h.alpha-ev5, trunk/tune.c: modified Added mpn_mul_n tuning to tune.c and erased tune2.c. Moved all the '#ifdef TUNE' blocks to sp.h and ecm-impl.h. Updated ecm-params.*. Fixed minor bug in configure.in. Updated TODO. Wed Jul 20 2005 23:37:17 dnewman -- r834 * trunk/test.pp1, trunk/mpzspv.c, trunk/test.ecm, trunk/auxlib.c, trunk/TODO, trunk/configure.in, trunk/mpzspm.c, trunk/spv.c, trunk/Makefile.am, trunk/test.pm1, trunk/tune.c: modified Complete overhaul of cputime () in auxlib.c, including changes to configure.in. This fixes the mingw issue of cputime () giving calendar time instead of process time. Tested on cygwin, mingw, athlon, athlon 64, alpha. Fixed tune.c to use elltime (). Some portability fixes in spv.c, mpzspv.c. Changed /bin/bash to /bin/sh in the test scripts. Shortened an unnecessarily long expression in mpzspm.c. Updated TODO. Tue Jul 19 2005 16:13:28 dnewman -- r833 * trunk/TODO, trunk/TODO.sp, trunk/stage2.c: modified Adjusted the expected memory calculation to take into account -treefile and NTT memory. Removed some items from TODO, TODO.sp Tue Jul 19 2005 13:49:33 dnewman -- r832 * trunk/tune-ecm_ntt.c, trunk/tune-mpmod.c, trunk/tune-mpzspv.c, trunk/tune-ntt_gfp.c: deleted * trunk/configure.in, trunk/Makefile.am, trunk/tune.c: modified Removed the dirty hack used to compile tune separately from the main code. Fixed a segfault bug in tune.c. 
Now configure links ecm- params.h.xxx to ecm-params.h (but see the TODO in configure.in) Fri Jul 15 2005 14:59:11 dnewman -- r831 * trunk/ecm-params.h.power4: added * trunk/mpzspv.c, trunk/mpzspm.c, trunk/spv.c: modified Corrected header for malloc() to fix compilation problem on OS X. Added ecm-params.h for the power4 line of cpus. Wed Jul 13 2005 20:53:53 dnewman -- r830 * trunk/ecm-params.h.alpha-ev6, trunk/ecm-params.h.default, trunk /tune-mpmod.c, trunk/ecm-params.h.athlonxp, trunk/ecm- params.h.athlon64, trunk/tune-ecm_ntt.c, trunk/tune-mpzspv.c, trunk /tune-ntt_gfp.c, trunk/ecm-params.h.alpha-ev5: added * trunk/mpmod.c, trunk/sp.h, trunk/Makefile.am, trunk/ecm_ntt.c, trunk/test.pm1, trunk/test.pp1, trunk/mpzspv.c, trunk/ntt_gfp.c, trunk/test.ecm, trunk/ecm-impl.h, trunk/tune.c: modified Added computation of NTT thresholds to tune.c by means of tune-*.c. Added ecm-params target to Makefile.am and ecm-params.h for some architectures. Fixed minor problem in test.* that was causing script errors on alphas. Wed Jul 13 2005 10:35:28 dnewman -- r829 * trunk/ntt_gfp.c, trunk/configure.in, trunk/sp.h, trunk/tune.c: modified Rewrote tune.c to use a function pointer framework for computing thresholds. Removed a redundant line from configure.in. Commented out unused functions in ntt_gfp.c. Tue Jul 12 2005 17:19:19 kruppa -- r828 * trunk/getprime.c, trunk/mpmod.c, trunk/factor.c, trunk/schoen_strass.c, trunk/main.c, trunk/stage2.c, trunk/ecm2.c, trunk/eval.c, trunk/test.pm1, trunk/test.pp1, trunk/pm1.c, trunk/pp1.c, trunk/test.ecm, trunk/ecm.c, trunk/lucas.c, trunk/ecm- impl.h, trunk/ecm.h, trunk/ecm-ecm.h, trunk/tune.c: modified Return code of ecm reflects primality of factor (if any) and cofactor. Renamed MOD_* macros to ECM_MOD_* and moved to ecm.h Mon Jul 11 2005 22:19:32 kruppa -- r827 * trunk/TODO: modified Removed entries for Montgomery roots, moving param selection out of stage 2. 
Added entries for fixing cputime and RNG seeding under Windows Mon Jul 11 2005 09:48:56 zimmerma -- r826 * trunk/mpmod.c: modified fixed bug in mpmod_init (use of mpz_sizeinbase instead of mpz_size) Sat Jul 09 2005 22:05:42 kruppa -- r825 * trunk/ecm.c, trunk/ecm2.c, trunk/ecm-impl.h: modified Added code for generating roots of F,G for ECM in Montgomery coordinates if S==1. Does not work yet and will probably never be fast. Disabled by default Thu Jul 07 2005 13:07:58 kruppa -- r824 * trunk/mpmod.c, trunk/pm1.c, trunk/ecm-impl.h: modified Fixes bug introduced with negative i0 Wed Jul 06 2005 15:29:12 kruppa -- r823 * trunk/pp1.c, trunk/stage2.c, trunk/ecm-impl.h: modified Fixes bug in P+1 introduced in last update Wed Jul 06 2005 07:34:04 dnewman -- r822 * trunk/mpzspv.c, trunk/TODO.sp: modified Rewrote mpzspv_to_mpzv to use a constant amount of memory, with a small resulting speedup. Updated TODO.sp. Wed Jul 06 2005 06:37:19 dnewman -- r821 * trunk/acinclude.m4: added * trunk/configure.in, trunk/Makefile.am: modified Some improvements to configure.in. Now --enable-redc verifies that the cpu really is a 32-bit x86 (with the help of a macro in acinclude.m4), and the manpage is only compiled if xsltproc and docbook.xsl are present. Tue Jul 05 2005 22:00:03 dnewman -- r820 * trunk/Fgw.c, trunk/TODO, trunk/configure.in, trunk/Makefile.am, trunk/ecm-impl.h: modified Added options --enable-asm-redc and --enable-ntt - now the gwnum, redc.s and ntt code compiles conditionally. Removed corresponding items from TODO, added one more.
Tue Jul 05 2005 21:05:43 dnewman -- r819 * trunk/redc.s: added * trunk/redc.asm: deleted Renamed redc.asm to redc.s Mon Jul 04 2005 22:24:06 dnewman -- r818 * trunk/stage2.c: modified Changed how sp_num is displayed when using -v Mon Jul 04 2005 21:13:00 kruppa -- r817 * trunk/mpmod.c, trunk/pm1.c, trunk/schoen_strass.c, trunk/resume.c, trunk/bestd.c, trunk/auxlib.c, trunk/stage2.c, trunk/rho.c, trunk/ecm_ntt.c, trunk/ecm-gmp.h, trunk/ecm-ecm.h, trunk/polyeval.c: modified Some cleanups to avoid compiler warnings Mon Jul 04 2005 20:20:52 dnewman -- r816 * trunk/configure.in: modified Removed AC_FUNC_MALLOC and AC_FUNC_REALLOC from configure.in, see http://lists.gnu.org/archive/html/bug-autoconf/2002-10/msg00075.html Mon Jul 04 2005 19:49:38 dnewman -- r815 * trunk/mpzspv.c, trunk/sp.h, trunk/ecm_ntt.c: modified Added missing config.h include to sp.h. Commented memory usage in many of the ntt functions. Reduced memory usage of mpzspv_normalise, resulting in a speedup. Mon Jul 04 2005 19:43:14 kruppa -- r814 * trunk/ecm2.c: modified Replaced variable length array in multiplyW2n with an mpz_t Mon Jul 04 2005 18:45:57 dnewman -- r813 * trunk/TODO: modified Added estimated memory item to TODO. Mon Jul 04 2005 06:39:03 dnewman -- r812 * (MANY FILES) : modified Replaced snprintf with malloc + sprintf throughout. Fixed incorrect prototype for ceil_log2(). Removed prototype for (static) usage(). Changed #include "gmp.h" to #include throughout. Sun Jul 03 2005 21:33:49 kruppa -- r811 * trunk/pm1.c, trunk/pp1.c, trunk/bestd.c, trunk/ecm.c, trunk/main.c, trunk/listz.c, trunk/stage2.c, trunk/ecm2.c, trunk/ecm-impl.h, trunk/ecm.h: modified Moved selection of dF, k, d1, d2 out of stage 2 so correct parameters can be printed immediately. Print expected number of curves before stage 1. Made dF and k unsigned long throughout. 
Sun Jul 03 2005 21:12:34 dnewman -- r810 * trunk/mpmod.c, trunk/schoen_strass.c, trunk/mpzspv.c, trunk/random.c, trunk/main.c, trunk/listz.c, trunk/configure.in, trunk/stage2.c, trunk/sp.h, trunk/spv.c, trunk/ecm-impl.h, trunk/ecm_ntt.c, trunk/ks-multiply.c, trunk/ecm-ecm.h, trunk/polyeval.c: modified Preliminary changes to configure script, now configure.in generates config.h. Added some autoconf checks. Renamed some #defines for standardisation purposes. Commented out some unused functions in spv.c. Sat Jul 02 2005 19:47:25 kruppa -- r809 * trunk/TODO: modified Some updates Sat Jul 02 2005 19:35:30 kruppa -- r808 * trunk/pp1.c, trunk/bestd.c, trunk/ecm2.c: modified Code to init roots of G can deal with negative i0 now Sat Jul 02 2005 16:18:28 dnewman -- r807 * trunk/sp.c, trunk/sp.h: modified Added 64-bit primality test for 64-bit machines. Defined UDItype in sp.h for longlong.h (fixes compilation on Athlon 64). Sat Jul 02 2005 15:26:32 kruppa -- r806 * trunk/pm1.c, trunk/pp1.c, trunk/stage2.c, trunk/ecm2.c: modified Cleanup. Removed redundant variable "s", using "i0" instead Sat Jul 02 2005 10:21:56 kruppa -- r805 * trunk/Fgw.c: modified Added dummy function to avoid "empty file" warning. To be fixed, use conditional compilation instead Sat Jul 02 2005 10:09:01 kruppa -- r804 * trunk/stage2.c: modified Removed stray "%" (caused segfault) Sat Jul 02 2005 09:40:34 kruppa -- r803 * trunk/mpzspv.c: modified Added for valloc() etc. Fixes crash on Sparc v9 Fri Jul 01 2005 20:08:28 kruppa -- r802 * trunk/stage2.c: modified Avoid floating point division by 0. when printing expected nr. of curves Thu Jun 30 2005 22:41:30 dnewman -- r801 * trunk/tune2.c, trunk/schoen_strass.c, trunk/TODO, trunk/configure.in, trunk/stage2.c, trunk/mul_lo.c, trunk/sp.h, trunk/ecm-impl.h: modified Added item to TODO. Fixed compilation for when HAVE_NTT not defined. Added AC_C_INLINE to configure.in and changed INLINE to inline everywhere. 
Wed Jun 15 2005 12:28:17 zimmerma -- r800 * trunk/TODO: modified added new item Mon Jun 13 2005 12:31:58 dnewman -- r799 * trunk/mpzspv.c, trunk/ecm_ntt.c: modified Minor fixes to header inclusion. Thu Jun 09 2005 14:35:21 dnewman -- r798 * trunk/ecm_ntt.c: modified Fixed potential segfault Thu Jun 09 2005 14:00:38 dnewman -- r797 * trunk/mpzspv.c, trunk/ntt_gfp.c, trunk/Fgw.c, trunk/stage2.c, trunk/spv.c, trunk/ecm_ntt.c: modified Removed declaration-within-code ISO-C unorthodoxy Thu Jun 09 2005 13:17:57 zimmerma -- r796 * trunk/stage2.c: modified // -> /* ... */ Thu Jun 09 2005 13:13:49 zimmerma -- r795 * trunk/sp.h: modified removed C++-style comments Thu Jun 09 2005 13:12:34 zimmerma -- r794 * trunk/Makefile.am: modified added longlong.h in noinst_HEADERS Thu Jun 09 2005 12:05:05 dnewman -- r793 * trunk/stage2.c, trunk/ecm-impl.h, trunk/ecm_ntt.c: modified Added -treefile support to ntt_PolyFromRoots_Tree and ntt_polyevalT Tue Jun 07 2005 19:34:46 kruppa -- r792 * trunk/rho.c: modified Minor cleanups (no change in functionality) Tue Jun 07 2005 19:32:43 kruppa -- r791 * trunk/mpmod.c, trunk/Fgw.c, trunk/ecm.c, trunk/ecm-impl.h: modified Interface to Woltman's GWNUM stage 1 for ECM Wed May 18 2005 11:56:07 kruppa -- r790 * trunk/mpzspv.c: modified #ifdef'd an malloc_usable_size() Sun Apr 24 2005 13:40:50 dnewman -- r789 * trunk/mpzspv.c, trunk/configure.in: modified Check for malloc_usable_size in autoconf as not all libc's have it. Sat Apr 23 2005 04:57:32 zimmerma -- r788 * trunk/getprime.c: modified improved getprime main loop Mon Apr 11 2005 16:12:23 dnewman -- r787 * trunk/ntt_gfp.c: modified Changed large length DIT's to use a recursive algorithm, rather than scramble + DIF + scramble. 
Sun Apr 10 2005 14:50:51 kruppa -- r786 * trunk/TODO: modified Added sliding window multiplication for ECM entry Thu Apr 07 2005 16:07:06 dnewman -- r785 * trunk/mpzspv.c: added * trunk/spm.c, trunk/ntt_gfp.c, trunk/stage2.c, trunk/sp.h, trunk/spv.c: modified Changed mpzspp to mpzspv and mpzp to mpzv. Added mpzspv_verify and lots of mpzspv assertions. Minor speedup by using valloc rather than malloc for sp coeff alignment. Speedup for ntt_PolyFromRoots. Memory reduction and speedup for ntt_polyevalT, also fixed a memory leak. mpzspv_to_mpzv no longer clobbers the input. Minor cosmetic changes. Fixed (probably) and documented the upper bound on sp_num in mpzspm_init. Check for some malloc errors. Thu Apr 07 2005 15:48:30 dnewman -- r784 * trunk/mpzspp.c: deleted * trunk/TODO.sp, trunk/mpzspm.c, trunk/Makefile.am, trunk/ecm-impl.h, trunk/ecm_ntt.c: modified Renamed mpzspp.c to mpzspv.c Tue Apr 05 2005 17:43:10 kruppa -- r783 * trunk/main.c: modified Added -faccmd option, compiled in only if WANT_FACCMD is defined Sat Apr 02 2005 00:40:57 dnewman -- r782 * trunk/stage2.c, trunk/sp.h, trunk/ecm-impl.h, trunk/mpzspp.c, trunk/ecm_ntt.c, trunk/polyeval.c: modified Added a preliminary version of ntt_polyevalT (without treefile support) and a couple of helper routines in mpzspp.c. Minor change to alignment of sp_invF. Un-static'd TUpTree in polyeval.c so ntt_polyevalT can use it. Minor change to thresholds in sp.h. Fri Apr 01 2005 08:15:57 kruppa -- r781 * trunk/NEWS: modified Added news entries for 6.0.1 Thu Mar 31 2005 21:57:41 dnewman -- r780 * trunk/TODO.sp, trunk/stage2.c, trunk/sp.h, trunk/ecm-impl.h, trunk/mpzspp.c, trunk/ecm_ntt.c: modified Adjusted functions in ecm_ntt.c so mpzspm_init is now only called once (ever). Abandoned the "automatic transform" idea. Rewrote ntt_PolyInvert and saved a transform per level. Rewrote ntt_PrerevertDivision to use a cached transform of 1/F. Fixed potential bug in mpzspp_normalise. 
Updated TODO.sp Thu Mar 31 2005 21:50:35 kruppa -- r779 * trunk/INSTALL: modified Added detail to, removed typo from Win install instruction Thu Mar 31 2005 19:35:36 fousse -- r778 * trunk/INSTALL: modified Typo. Thu Mar 31 2005 19:22:16 kruppa -- r777 * trunk/INSTALL: modified Added install instructions for Windows/MinGW Wed Mar 30 2005 15:53:29 kruppa -- r776 * trunk/README: modified Updated Note on ECM extra smoothness Wed Mar 30 2005 15:34:47 kruppa -- r775 * trunk/rho.c: modified Remove GSL dilog_series code (is GPL, not LGPL). Changed EXTRA_SMOOTHNESS to 23.4 (Montgomery's value) Mon Mar 28 2005 18:07:54 kruppa -- r774 * trunk/TODO: modified Added shell command on event entry Mon Mar 28 2005 18:05:52 kruppa -- r773 * trunk/main.c: modified Fix segfault when parsing B2, work around MinGW scanf() bug Mon Mar 28 2005 18:04:57 kruppa -- r772 * trunk/stage2.c, trunk/rho.c: modified Free rhotable memory at end of stage 2 Wed Mar 23 2005 19:45:30 zimmerma -- r771 * trunk/ecm-impl.h: modified added missing macro Wed Mar 23 2005 19:36:35 zimmerma -- r770 * trunk/pp1.c, trunk/ecm.c, trunk/mul_lo.c: modified added comments and normalization when FULL_REDUCTION is not defined Wed Mar 23 2005 17:07:25 kruppa -- r769 * trunk/TODO: modified Added negative i0, composite d2, avoiding reallocs, getting bestD() out of stage 2. Added release targets for some entries. Tue Mar 22 2005 15:02:43 zimmerma -- r768 * trunk/bestd.c, trunk/TODO: modified added one TODO item added one dF value (600600) Mon Mar 14 2005 01:24:18 dnewman -- r767 * trunk/ntt_gfp.c, trunk/sp.h, trunk/spv.c, trunk/mpzspp.c, trunk/ecm_ntt.c: modified Started work on a framework for transform caching - now mpzspp's should automatically cache transforms and normalise. Added mpzspp_[to/from]_ntt functions to facilitate this and help simplify ntt_PrerevertDivision. Fixed an incorrect assertion in spv_mul. 
Sat Mar 12 2005 19:24:44 dnewman -- r766 * trunk/sp.h, trunk/mpzspp.c, trunk/ecm_ntt.c: modified mpzspp_t's are now passed by reference instead of by value. Tue Mar 08 2005 17:22:18 dnewman -- r765 * trunk/listz.c: modified Fixed a bug in list_neg Tue Mar 08 2005 13:50:34 zimmerma -- r764 * trunk/listz.c: modified fixed potential bug in list_neg Tue Mar 08 2005 13:35:27 dnewman -- r763 * trunk/ecm.xml: modified Corrected Dave Newman's email address from firstname.lastname@... to david.lastname@... Tue Mar 08 2005 01:21:37 dnewman -- r762 * trunk/sp.h, trunk/ecm_ntt.c: modified Unrolled recursion in ntt_PolyInvert and added a DEBUG block to verify it gives the right answer. Adjusted NTT_POLYINVERT_THRESHOLD in sp.h Mon Mar 07 2005 11:57:47 dnewman -- r761 * trunk/sp.h, trunk/ecm_ntt.c: modified Rewrote ntt_PolyInvert. This seems to stop the "Found input number N" errors. Adjusted POLYINVERT_NTT_THRESHOLD accordingly. Sun Mar 06 2005 19:21:41 kruppa -- r760 * trunk/ecm.xml: modified Small corrections, Sch"onhage now spelled with oe. 
Sun Mar 06 2005 13:29:07 zimmerma -- r759 * trunk/TODO, trunk/configure.in: modified changed version to 6.1 added item in TODO Sat Mar 05 2005 21:29:25 zimmerma -- r758 * trunk/resume.c, trunk/main.c, trunk/eval.c, trunk/ecm.h: modified patch for Apple ('\r' instead of '\n' for newline) Fri Mar 04 2005 20:46:07 kruppa -- r757 * trunk/README.lib, trunk/main.c, trunk/INSTALL: modified Small correction to --help output (missing abs bars in -base2 option) Fri Mar 04 2005 13:05:24 kruppa -- r756 * trunk/Fgw.c: modified Added copyright notice for GWNUM based code Fri Mar 04 2005 12:33:06 dnewman -- r755 * trunk/spm.c, trunk/ntt_gfp.c, trunk/sp.c, trunk/longlong.h, trunk/mpzspm.c, trunk/sp.h, trunk/spv.c, trunk/ecm_ntt.c, trunk/mpzspp.c: modified added licence headers Fri Mar 04 2005 12:07:47 dnewman -- r754 * trunk/ecm-impl.h: modified added ecm-specific ntt functions Fri Mar 04 2005 11:51:38 dnewman -- r753 * trunk/stage2.c: modified added HAVE_NTT option to enable sp code, changed dF to be always a power-of-two Fri Mar 04 2005 11:49:40 dnewman -- r752 * trunk/spv.c: added added spv.c Fri Mar 04 2005 11:49:29 dnewman -- r751 * trunk/spm.c: added added spm.c Fri Mar 04 2005 11:49:04 dnewman -- r750 * trunk/ntt_gfp.c: added added ntt_gfp.c Fri Mar 04 2005 11:48:47 dnewman -- r749 * trunk/mpzspp.c: added added mpzspp.c Fri Mar 04 2005 11:48:29 dnewman -- r748 * trunk/mpzspm.c: added added mpzspm.c Fri Mar 04 2005 11:48:08 dnewman -- r747 * trunk/longlong.h: added added longlong.h (copy from gmp-4.1.4) Fri Mar 04 2005 11:47:33 dnewman -- r746 * trunk/listz.c: modified removed static from list_mul and list_neg so the sp code can use them Fri Mar 04 2005 11:46:23 dnewman -- r745 * trunk/ecm_ntt.c: added added ecm_ntt.c Fri Mar 04 2005 11:46:02 dnewman -- r744 * trunk/TODO.sp: added added TODO.sp Fri Mar 04 2005 11:45:34 dnewman -- r743 * trunk/Makefile.am: modified added sp sources Fri Mar 04 2005 10:39:30 dnewman -- r742 * trunk/sp.c: added added sp.c Fri Mar 04 2005 
10:39:12 dnewman -- r741 * trunk/sp.h: added added sp.h Fri Mar 04 2005 09:11:07 zimmerma -- r740 * trunk/README: modified added note about GMP thresholds Wed Mar 02 2005 22:46:58 zimmerma -- r739 * trunk/tune2.c: modified added ECM_STDOUT, ECM_STDERR Wed Mar 02 2005 22:06:00 zimmerma -- r738 * trunk/schoen_strass.c, trunk/random.c, trunk/main.c, trunk/listz.c, trunk/stage2.c, trunk/ecm2.c, trunk/eval.c, trunk/ks-multiply.c, trunk/polyeval.c: modified small changes for _MSC_VER prints memory usage (with -v) prints argv[0] instead of "ecm" in case of error Wed Mar 02 2005 22:02:34 zimmerma -- r737 * trunk/TODO: modified added item (-maxmem) Wed Mar 02 2005 22:02:06 zimmerma -- r736 * trunk/README: modified added "10. Known problems" Wed Mar 02 2005 22:01:17 zimmerma -- r735 * trunk/AUTHORS: modified added beta-testers Wed Mar 02 2005 21:30:08 kruppa -- r734 * trunk/README: modified Mention default S for Fermat numbers (S=1 or 2). "he thinks" -> "it thinks". Wed Mar 02 2005 12:37:53 zimmerma -- r733 * trunk/main.c: modified put back "Run xxx out of yyy" in loop mode Wed Mar 02 2005 12:24:42 zimmerma -- r732 * trunk/TODO: modified added new item Tue Mar 01 2005 21:52:27 kruppa -- r731 * trunk/rho.c: modified Fixed array out-of-bounds access, reported by Jon Becker Tue Mar 01 2005 21:51:31 kruppa -- r730 * trunk/b1_ainc.c, trunk/pm1.c, trunk/pp1.c, trunk/ecm.c: modified Calculation of default B2 in ecm(), pm1() or pp1() would overwrite caller's B2min, B2 (mpz_t), using local copies now. Auto-increment returns rounded B1. 
Fixes bug reported by "Phil MjX" on mersenneforum.org Tue Mar 01 2005 10:19:55 zimmerma -- r729 * trunk/test.ecm: modified simplified test so that it takes less time (overall gain of about 3) Tue Mar 01 2005 08:30:01 zimmerma -- r728 * trunk/mpmod.c, trunk/ecm.c, trunk/tune.c: modified fix to link problem for "tune" Mon Feb 28 2005 16:07:28 kruppa -- r727 * trunk/stage2.c, trunk/configure.in, trunk/polyeval.c: modified Checks for snprintf(), if not available falls back to sprintf() Mon Feb 28 2005 13:11:10 kruppa -- r726 * trunk/Fgw.c: modified Removed original gwnum dword->fft conversion code Mon Feb 28 2005 11:40:55 zimmerma -- r724 * trunk/ChangeLog, trunk/README.dev: modified updated ChangeLog/README.dev 2005-02-28 paul * release version 6.0 * ecm.xml: updated -primetest 2005-02-27 alex * pm1.c, pp1.c, ecm.c: For Fermat numbers, S=1 (S=2 for P-1) is now default * Makefile.am, configure.in, ecm-impl.h, main.c, mpmod.c, schoen_strass.c, Fgw.c: More changes for linking gwnum 2005-02-24 alex * Fgw.c, configure.in: Changes for linking gwnum (incomplete) * main.c: Added option -h for getting help * ecm.xml: More cleanup 2005-02-23 paul * auxlib.c, ecm-impl.h, ecm.c, ecm2.c, ks-multiply.c, pm1.c, pp1.c, stage2.c, tune.c, tune2.c: hopefully fixed the wrap-around bug of cputime() * INSTALL, NEWS, README, configure.in: changed version to 6.0 removed "Known problems" in README (were obsolete) * random2.c, smartprp.c, trial.c, auxi.c, b1_ainc.c, candi.c, eval.c, getprime2.c, main.c: fixed copyright years/names 2005-02-23 alex * ecm.xml: Some inconsistent typesetting fixed * ecm.xml: added -base2, -timestamp, -savea, different loglevels 2005-02-23 paul * ChangeLog, Makefile.am, README, README.lib, auxi.c, configure.in, ecmfactor.c, factor.c, pp1.c, smartprp.c, tune2.c: gmp -> GMP fixed one memory leak added missing (L)GPL headers 2005-02-22 paul * Makefile.am, README: forgot to install ecm.h * AUTHORS, ChangeLog, INSTALL, Makefile.am, NEWS, README, README.lib, TODO, 
ecm-ecm.h, ecm-impl.h, ecm.xml, eval.c, mpmod.c, mul_lo.c, polyeval.c, tune2.c: changes after remarks from Torbjo"rn on rc4 * INSTALL, Makefile.am, auxi.c, candi.c, eval.c, main.c, memory.c: don't install tune*, ecmfactor fixed problems with -DMEMORY_DEBUG 2005-02-22 paul * INSTALL, Makefile.am, auxi.c, candi.c, eval.c, main.c, memory.c: don't install tune*, ecmfactor fixed problems with -DMEMORY_DEBUG 2005-02-22 alex * stage2.c: bestD() overwriting original B2 caused B2 to keep ever growing. Fixed * README, bestd.c: Corrected computation of effective B2 value in bestD(). It did not account for rounding up due to integer block size. This changes computed probabilities, table in README updated accordingly * main.c: Added option -timestamp * INSTALL: On SunOS, the compiler and flags must be specified at configure time, or testing for GMP lib/header version match fails * TODO: Removed -base2 entry, it's done 2005-02-22 laurent * Makefile.am: Distribute ecm.xml too. 2005-02-22 paul * factor.c, main.c: fixed memory leaks * listz.c: moved declaration outside a #ifdef 2005-02-22 alex * bestd.c, rho.c, stage2.c, README: bestD() computes new effective B2 value. ecmprob() now assumes group order divisible by 24 on average (README updated). * auxlib.c: outputf() was missing the va_end(). Not sure if it is really needed (i.e. to avoid memleaks), but it's cleaner to have it 2005-02-22 paul * TODO, auxlib.c: fixed problem under MinGW (thanks Japke Rosink) 2005-02-22 alex * main.c: Added #include , missing it causes segfault on Amd64 because sizeof(int) != sizeof(char*) 2005-02-22 paul * ecmprob.magma: added some results 2005-02-21 alex * stage2.c, main.c, pm1.c, pp1.c, ecm.c, ecm.h, factor.c, bestd.c, ecm-impl.h: Changed B2, B2min to mpz_t, can now be arbitrarily large so long as B2-B2min < ~1e24. 2005-02-19 paul * pm1.c, pp1.c: put back x0=... in normal verbose mode (P-1/P+1) 2005-02-19 alex * ecm2.c, main.c, mpmod.c, pm1.c, pp1.c, ecm-impl.h, ecm.c: Added option -base2. 
Added option -savea (appending to save files). Fixed bug in ecm2.c multiplyWn() debugging code * pp1.c, stage2.c, pm1.c, ecm.c, ecm-impl.h: Added another verbosity level: RESVERBOSE, enabled with "-v -v". Prints intermediate residues (after stage 1 etc), mere "-v" does not anymore 2005-02-18 paul * pp1.c: fixed problem with |S|=1 (was: pp1_mul did modify the exponent passed) 2005-02-18 alex * pp1.c: A bug for |S| > 1 fixed, |S| == 1 still mysteriously broken * stage2.c, bestd.c, ecm-impl.h, ecm.h, ecm2.c, main.c, pm1.c, pp1.c: Using mpz_t for i0, s and related vars. Eliminates most overflow conditions. Currently, P+1 with S=1 does not work correctly, to be fixed later 2005-02-17 alex * ChangeLog: Added treefile, double sieve, new ECM root generation that reduces extgcds * TODO: treefile is done, removed 2005-02-17 paul * ecm-impl.h, ecm.c, pp1.c, ChangeLog, TODO: changed default poly. choice in P+1 (now same as in ECM) 2005-02-17 alex * stage2.c: Fixed segfault: n could get mpz_clear'd without having been inited Fixed overflow condition in fin_diff_coeff() by using mpz_t's. Makes test.pp1 work with |S| > 1. Small comment changes. 
* pm1.c: Changed printf to outputf, #ifdef DEBUG to WANT_ASSERT and exit() to return with error status 2005-02-17 paul * ecmprob.magma, pm1.c, pp1.c, stage2.c, ecm-impl.h, ecm.h: finished Brent-Suyama's extension for P+1 2005-02-16 paul * ecm-impl.h, ecm2.c, pp1.c, stage2.c: Brent/Suyama for P+1 (cont'd) 2005-02-16 alex * ecm2.c: Replaced printf by outputf 2005-02-16 paul * stage2.c, ecm-impl.h, ecm2.c, pp1.c: preliminary code for Brent/Suyama's extension in P+1 2005-02-15 alex * ecm.c: Increased thresholds for high Brent-Suyama degrees * README: A small correction about mem saving with treefiles, and typo * ecm-impl.h, pp1.c, stage2.c, bestd.c: My latest attempt to get bestD() right: new conditions for i0 and i1, and fixed init of progressions if i0 != 0 (mod d2) 2005-02-14 alex * polyeval.c, stage2.c: Small cleanups of treefile code 2005-02-13 alex * README: Added description of -treefile, plus some small changes 2005-02-11 paul * random.c: commented the outputf calls in non-library mode 2005-02-11 alex * bestd.c, random.c, stage2.c, TODO: Small changes to use ECM_ERROR and outputf(), updated TODO 2005-02-11 paul * polyeval.c: removed unused variable 2005-02-11 alex * pp1.c, stage2.c, main.c, pm1.c, polyeval.c, ecm.h, factor.c, listz.c, ecm-impl.h, ecm.c: Storing product tree of F in files pretty much works now. Some cleanups tbd 2005-02-10 alex * test.ecm: Small comment change 2005-02-09 paul * pp1.c, stage2.c, README.lib, ecm.c, ecm.h, factor.c, main.c, pm1.c: updated README.lib now use the library function ecm_factor() in main.c too! 
2005-02-09 alex * schoen_strass.c: Changes for library, uses ASSERT and outputf() now 2005-02-08 paul * Makefile.am, README, auxlib.c, bestd.c, ecm-impl.h, ecm2.c, ecmfactor.c, main.c, mul_lo.c, trial.c, tune.c, tune2.c: further cleanup * README.lib: documentation for libecm * pm1.c, pp1.c, stage2.c, factor.c, listz.c, main.c, mpmod.c, ecm-impl.h, ecm.c, ecm.h, ecm2.c, auxlib.c: got rid of verbose parameter through outputf() interface * random.c, ecm.h, ecm2.c, ecmfactor.c, factor.c, main.c, pm1.c, pp1.c, auxi.c, ecm-ecm.h, ecm-impl.h, ecm.c: further work for library interface * mul_lo.c, stage2.c, main.c: fixed problems to please icc * test.pm1: added test for step 2 primes near B2min * Makefile.am, ecm-ecm.h, ecm-impl.h, ecm.c, ecm.h, factor.c, main.c, pm1.c, pp1.c, resume.c, stage2.c: preliminary new file factor.c for library interface 2005-02-07 alex * rho.c: Changed error testing/handling to ASSERT. 2005-02-07 paul * auxlib.c, bestd.c, ecm-impl.h, ecm2.c, getprime.c, ks-multiply.c, mpmod.c, rho.c, stage2.c, toomcook.c: got rid of xmalloc() [now allocation errors are signaled to caller] replaced abort, exit, stdout, stderr, printf, gmp_printf... * schoen_strass.c, configure.in: check for __gmpn_mod_34lsub1 in configure, and wrote a replacement for it when it does not exist * auxlib.c: use ECM_STDOUT/ECM_STDERR * auxi.c, ecm-ecm.h: removed gcd in auxi.c, ecm-ecm.h * ks-multiply.c, listz.c, lucas.c, median.c, mpmod.c, mul_lo.c, pm1.c, polyeval.c, pp1.c, rho.c, schoen_strass.c, stage2.c, tune2.c, TODO, auxlib.c, bestd.c, ecm-ecm.h, ecm-impl.h, ecm.c, ecm2.c, getprime.c: removed FILE arguments 2005-02-06 alex * ecm-ecm.h, ecm-impl.h, ecm.c, main.c, stage2.c, auxlib.c: Encapsuled output control in outputf() function, made stage2.c and ecm.c use that function * rho.c: For B1=1, computation would go into very long loop. Fixed * stage2.c: Removed experimental SAVE_TREE code, put back in for pow() etc. 
2005-02-05 alex * rho.c, stage2.c: Got rid of finite() in stage2.c, made sure ecmprob() does not return negative values 2005-02-04 paul * TODO: put proposal from Karim Belabas * schoen_strass.c, stage2.c, toomcook.c, mul_lo.c, polyeval.c, rho.c, median.c, listz.c, ks-multiply.c, TODO, ecm-impl.h: more changes to catch errors and specify output/error streams * ecm-impl.h, stage2.c, test.pm1: changed one P-1 test (B2 was too near from 2^53, so that B2' overflowed) 2005-02-04 alex * bestd.c: Fixed bug in bestd() ...again (following PaulZ's suggestion) 2005-02-04 paul * bestd.c, mul_lo.c: added FIXME in bestd.c changed mul_lo.c to use table computed by tune2.c * tune2.c: fixed pb with return value * median.c, mpmod.c, pm1.c, polyeval.c, pp1.c, stage2.c, ecmfactor.c, getprime.c, listz.c, lucas.c, main.c, ecm2.c, ecm-impl.h, ecm.c, ecm.h: put back changes from Alex (rev 1.57 to 1.58 of pm1.c) that I removed by error added control for output/error streams changed exit(1) into error return values 2005-02-03 paul * TODO, configure.in: updated TODO removed useless message in configure 2005-02-03 alex * ecm-impl.h, ecm2.c, mpmod.c, pm1.c, pp1.c: Replaced UNUSED by ATTRIBUTE_UNUSED, following GMP convention (see gmp-impl.h) * TODO: Montgomery's double sieve is pretty much implemented now * bestd.c: Fixed bug in bestd() which sometimes caused too small d values to be chosen, thus failing to test entire [B2min,B2] interval * auxlib.c: SunOS needs time.h for CLOCKS_PER_SEC 2005-02-03 paul * pm1.c, polyeval.c, pp1.c, random.c, rho.c, schoen_strass.c, stage2.c, toomcook.c, mpmod.c, mul_lo.c, ecm2.c, getprime.c, ks-multiply.c, listz.c, lucas.c, median.c, auxlib.c, bestd.c, ecm.c: changed header from GPL to LGPL added missing years if any * ecmfactor.c: improved readability 2005-02-03 laurent * median-aux.c: Removed unused file median-aux.c. * ecm.xml: ecm.xml validates again. 
2005-02-03 paul * Makefile.am: added missing headers * getprime.c: added main() to compile with -DMAIN * countsmooth.c: added missing #ifndef and url for primegen 2005-02-03 alex * schoen_strass.c: Made F_mod_*() static, hopefully to please icc 2005-02-03 paul * auxlib.c, ecm-ecm.h, ecm-impl.h, ecmfactor.c, getprime2.c, nbdigits.c, random.c, random2.c: new files for library * stage2.c, toomcook.c, trial.c, tune.c, tune2.c, smartprp.c, resume.c, rho.c, schoen_strass.c, mul_lo.c, pm1.c, polyeval.c, pp1.c, ks-multiply.c, listz.c, lucas.c, main.c, median.c, mpmod.c, ecm.h, ecm2.c, eval.c, getprime.c, bestd.c, candi.c, configure.in, countsmooth.c, ecm-gmp.h, ecm.c, Makefile.am, auxi.c, b1_ainc.c: complete rewrite to separate library/frontend * test.pp1, check.mpl, ecm.h, listz.c, stage2.c: modified one P+1 test * ecm-gmp.h, pm1.c, tune.c, tune2.c: fixed warnings (with -W -Wall -pedantic -Wmissing-prototypes) 2005-02-02 alex * ecm2.c, pm1.c, stage2.c, ecm.h: Cleanup of ecm/pm1_rootsF code. Small speedup in pm1_rootsF init. 
2005-02-02 paul * Makefile.am, ecm.h, mul_lo.c, tune.c, tune2.c: added new file tune2.c for tuning mpn_mul_lo_n() * ecm-gmp.h: added missing MPN_OVERLAP_P * ecm.xml: changed email for Jim * AUTHORS: added item for Jim * AUTHORS: mungled email addresses * README, configure.in, ecm-gmp.h, ecm.h, ks-multiply.c, mpmod.c, mul_lo.c, pm1.c, tune.c: added --enable-assert to configure got rid of WANT_GMP_IMPL * README, ecm.xml: added -prp options in ecm.xml cleaned README * main.c, pm1.c, stage2.c, eval.c: more cleanup * README, bestd.c, countsmooth.c, ecm.c, ecm.h, eval.c, getprime.c, main.c, mpmod.c, pm1.c, pp1.c, trial.c: more cleanup (fixed icc warnings) * Makefile.am, README, TODO, configure.in, ecm.xml, mpmod.c: updated README added target tune in Makefile 2005-02-01 alex * pp1.c, stage2.c, ecm.h: Printing of mul count in rootsF/G for P+1 put back in * ecm2.c: Better selection of number of parallel progressions in ecm_rootsF(), avoids unreasonably long init times 2005-02-01 paul * main.c, pm1.c, pp1.c, test.ecm, README, candi.c, ecm.c, ecm.h: made -go work for P+1 and ECM 2005-02-01 alex * pm1.c: Merge errors. :( Fixed * ecm2.c, pm1.c: Put counting of muls/extgcds when computing roots of F/G back in. 
Only printed with "-v -v" (or more -v) 2005-02-01 paul * ChangeLog, Makefile.am, README.dev, TODO, configure.in, ecm.xml, main.c: updated TODO, ChangeLog, ecm.xml * AUTHORS: obscured my email 2005-02-01 alex * AUTHORS: Updated (and obscured) my email address 2005-02-01 paul * configure.in: avoid AC_CHECK_LIB 2005-02-01 alex * INSTALL: Minor corrections 2005-02-01 paul * configure.in: check mpn_mul_fft *after* LDFLAGS/LIBS are defined * TODO, ecm.c, ecm.h, main.c, pm1.c, pp1.c, stage2.c, ChangeLog, README: removed -ticdelay option (was unsupported) * INSTALL, Makefile.am, ecm.xml: added man page (ecm.1) in distribution added install instruction in INSTALL * auxi.c: added missing years * ecm.xml: added missing * INSTALL, ecm.xml: modified INSTALL for new configure/make 2005-01-31 paul * bestd.c, ecm.h, ks-multiply.c, listz.c, median.c, resume.c, toomcook.c: code cleanup (removed muls count, unused code, ...) * AUTHORS, README, ecm.xml, ks-multiply.c, listz.c: improved documentation 2005-01-31 alex * resume.c: Fixed possible unterminated string 2005-01-31 paul * Makefile.am, README.dev, configure.in: simplified LDFLAGS/LIBS * README.dev, ecm.1, ecm.xml: source documentation is now ecm.xml (docbook) * main.c: updated champion's digits 2005-01-28 paul * test.ecm: removed last line 2005-01-28 laurent * configure.in: Configure.in needs to set LDFLAGS. 2005-01-28 paul * ks-multiply.c, listz.c, ecm.h: fixed bug (MPN_COPY requires size > 0) * ks-multiply.c, test.ecm, test.pm1, test.pp1: check non-zero size in MPN_COPY 2005-01-28 laurent * ecm.h, main.c, resume.c: VERSION is defined in configure.in now. 
2005-01-28 paul * pp1.c, schoen_strass.c, stage2.c, Makefile.am, README, TODO, bestd.c, configure.in, ecm.c, ecm.h, ecm2.c, eval.c, listz.c, lucas.c, main.c, mul_lo.c, ntl.c, pm1.c, polyeval.c, polyz.c: removed counting of multiplications removed use of NTL updated TODO 2005-01-27 paul * AUTHORS, ks-multiply.c: filled AUTHORS file * main.c: typo * README.dev: need aclocal too * Makefile: removed -> switch to autotools * AUTHORS, NEWS, README.dev: new files for autotools * ecm2.c, ks-multiply.c, listz.c, stage2.c: fixed a few memory leaks 2005-01-27 alex * pm1.c: Fixed mem leak (pointed out by PaulZ) 2005-01-26 paul * INSTALL, README, b1_ainc.c, main.c: updated README * ChangeLog, mpmod.c: modified ChangeLog added special base2mod for Fermat numbers 2005-01-26 laurent * Makefile.am, configure.in: Rules cleaning. * Makefile.am, configure.in: More autotools voodoo, borrowed from MPFR. 2005-01-26 paul * listz.c, mpmod.c, pm1.c, schoen_strass.c, INSTALL: further cleaning of "unused" variables * bestd.c, ecm.c, ecm.h, listz.c, main.c, pm1.c, pp1.c: removed unused code and variables 2005-01-25 paul * bestd.c, ecm.h, ks-multiply.c, listz.c: fixed a problem in bestD changed version to 5.2.0 implemented wrap-around trick in division 2005-01-17 laurent * Makefile.am: Add missing source file. * bestd.c: No need to bother with values.h, float.h is enough and seems to work just as well. 2005-01-15 laurent * auxi.c: Correct call to getrusage. 2005-01-14 alex * ecm.h, pm1.c, pp1.c, stage2.c, ecm.c: Expected time to find a factor printed at end of stage 2 2005-01-07 paul * redc.asm: added copyright header * mpmod.c, redc.asm, Makefile: added assembly support for redc 2005-01-05 paul * mpmod.c: renamed ecm functions with mpz_ or mpn_ prefix 2005-01-04 paul * mpmod.c: improved mpz_mod_n 2004-10-24 alex * rho.c: Added missing prototypes * ecm.h, rho.c, stage2.c, Makefile: Dickman's rho function for computing ECM's probability of success. 
Is printed for various factor sizes with -v parameter. 2004-10-12 alex * ecm-gmp.h: Replaced #if ... /#else #if ... by /#elif ... * schoen_strass.c: Fixed checksum debugging code for NOPAD case 2004-10-06 paul * mul_fft.c: this is the mul_fft.c code from GMP (gmp-cvs-20040917 patched) * main.c: changed default k to 2 2004-09-29 paul * ks-multiply.c, listz.c, stage2.c: added wrapmul in ks-multiply.c (not used so far) removed check for number of multiplies in stage2.c (not possible with KS) 2004-09-28 alex * stage2.c: Print final residue (product of polyeval output) if verbosity >= 3, i.e. "-v -v" * schoen_strass.c, mpmod.c, Fgw.c, Makefile, ecm.h: Interface to use George Woltman's gwnum library for Fermat numbers 2004-09-23 alex * mpmod.c: Stupid bug: = instead of == in comparison. :( 2004-09-21 paul * mpmod.c: fixed misusage of mpn_mul_fft() 2004-09-21 alex * ecm.h, mpmod.c, schoen_strass.c: Use of GMP FFT in mpmod.c self-contained now, and used only for exponent >=32768. Use of GMP FFT added to schoen_strass.c, but currently disabled - misses a factor of F15. * README: Small changes to Note on Fermat numbers section 2004-09-21 paul * mpmod.c: with HAVE_FFT defined, mpres_mul() directly calls mpn_mul_fft() * README: removed one sentence 2004-09-13 alex * README: Some changes to Brent-Suyama section. Added note on factoring Fermat numbers. 2004-09-13 paul * main.c: disable Line=... messages by default * ks-multiply.c: fixed bug when m=0 * Makefile: added CFLAGS to LD (needed on Sparc) 2004-09-11 alex * schoen_strass.c, listz.c: PolyInvert() now uses a middle product for Fermat numbers * schoen_strass.c, stage2.c: Moved global var Fermat from stage2.c to schoen_strass.c 2004-09-10 alex * polyeval.c: Oops - accidentally deleted a list_mod() statement. Fixed. * polyeval.c, bestd.c: polyeval_tellegen() still used a short product instead of Schoenhage-Strassen for Fermat numbers! Fixed, approx. doubles speed for POLYEVAL and mul count matches theory now. 
* median.c: Additional parameter check for Fermat case of KMulGen() * ecm.h: KS_MULTIPLY was accidentally #undef'd, now #define'd again * median.c: TMulGen would call TMulKS() even if KS_MULTIPLY was undef'd. Fixed. 2004-09-09 alex * bestd.c, ecm.h, pp1.c, stage2.c: Double sieve for P+1 (does not use multiple progressions, only skips) bestD() takes d2 into account. 2004-09-09 laurent * configure.in: Verify that GMP was compiled with FFT enabled. * Makefile.am, configure.in: Actually test for GMP presence in configure. Support for test targets in Makefile.am. 2004-09-09 paul * median.c, test.pm1, ecm.h, ks-multiply.c, listz.c, main.c, bestd.c: use middle product in PolyInvert changed bestD() for new KS routines now TMulKS takes an additional parameter "rev" 2004-09-09 alex * mpmod.c, pm1.c, stage2.c, test.ecm, bestd.c, ecm.h, ecm2.c: Double sieve for generating roots of ECM and P-1, see Montgomery "Speeding", section 9. P+1 tbd. bestD() does not know about larger block length yet, tbd. 2004-09-09 paul * ks-multiply.c: malloc -> xmalloc * ks-multiply.c: fixed malloc bug * trial.c, auxi.c: fixed warning 2004-09-08 alex * listz.c: added #else to avoid warning about unreachable code by Sun CC.
2004-09-08 paul * polyeval.c, ecm2.c, ks-multiply.c, median.c, ecm.h, ecm-gmp.h: added TMulKS() [and macro FFT_WRAP, and KS_TMUL_THRESHOLD] moved ASSERT() to ecm.h fixed typo in ecm2.c * ecm.h, listz.c, median.c, polyeval.c: added list_swap fixed inefficiency in polyeval_tellegen (list_mul_high call did not reduce coefficients mod n) 2004-09-07 paul * listz.c, median-aux.c, median.c, polyeval.c, ecm.h, ks-multiply.c: cleaned up polyeval_tellegen and median.c: - converted comments in english - use functions of listz.c when possible - translate mpz_mul_ui (.., .., 2) into mpz_mul_2exp - other tiny optimizations * TODO: sorted items * pm1.c, pp1.c, main.c: default for -ticdelay is now -1 * ecm.h: added comments about #define's 2004-09-06 paul * mpmod.c: oops, modulus->bits can be negative * mpmod.c: modulus->bits is always positive * ecm.h, mpmod.c, pm1.c: now isbase2() is called with same threshold BASE2_THRESHOLD (in ecm.h) base2mod() changed to perform no division any more 2004-09-03 alex * stage2.c, listz.c: Where possible, polynomials F, G and invF are deallocated before calling polyeval(), polyeval_tellegen() or poly_gcd(). * ecm.h: POLYEVAL got set whenever POLYEVALTELLEGEN was set, overriding POLYGCD. Changed so that POLYGCD overrides both POLYEVAL and POLYEVALTELLEGEN. 2004-09-03 paul * stage2.c: experimental code to save/restore the product tree 2004-09-03 alex * schoen_strass.c, listz.c: PrerevertDivision() uses a non-zeropadded transforms for Fermat numbers if poly degree is not too large. Removed Matrix Fourier Algorithm (no faster) and testing code in schoen_strass.c 2004-09-03 paul * ks-multiply.c: added comment 2004-09-01 laurent * Makefile.am, configure.in: Autoconf'ing the project. 2004-09-01 alex * ecm.h, listz.c, schoen_strass.c: Added option for transform without zero padding to Sch"onhage-Strassen. 
2004-08-31 paul * ks-multiply.c: thresholds now takes into account number size too 2004-08-31 alex * TODO: Added entry for idea of choosing roots of F and G from Montgomery "Speeding", 9.1.3 2004-08-31 paul * Makefile, ks-multiply.c, listz.c: improved PrerevertDivision when KS_MULTIPLY defined 2004-08-30 alex * Makefile, auxi.c, ecm.h, listz.c, schoen_strass.c, toomcook.c: Sch"onhage-Strassen code now can multiply monic polynomials directly. This saves adds when building polys from their roots, unfortunately the speed gain is minimal. mpz_divby3_1op() moved to auxi.c 2004-08-30 paul * ks-multiply.c: Kronecker-Scho"nhage's code, contributed by David Newman * Makefile, bestd.c, ecm-gmp.h, ecm.h, listz.c, mpmod.c, polyeval.c, stage2.c: integrated David Newman's Kronecker-Scho"nhage's code * schoen_strass.c: commented out mpz_divby3_1op (already in toomcook.c) * TODO: added one todo line (KS mult) 2004-08-26 alex * bestd.c, ecm.h, ecm2.c, stage2.c: bestd_po2() examines both B2min and B2 instead of just their difference, and has better d values for very small polynomial degrees. * resume.c: Resuming now always reduces the x-coordinate (mod N). * schoen_strass.c: Multiplication of transformed coefficients inside the MFA transform. Should save uncached memory accessed but isn't much faster in practice. * ecm.h, ecm2.c, stage2.c: Correct handling of adding identical points in addWnm() by doubling. * Makefile: Commented out Sun cc specific options * schoen_strass.c: Routines for Sch"onhage-Strassen multiplication of polynomials 2004-07-28 alex * pm1.c, pp1.c, resume.c, stage2.c, Makefile, auxi.c, bestd.c, ecm-gmp.h, ecm.h, ecm2.c, listz.c, main.c, median.c: Schoenhage-Strassen for multiplying polynomials modulo Fermat numbers. Computation of roots for F and G for ECM does several progressions in parallel, to reduce number of extgcds and, for F, reduce the number of roots computed. * eval.c: Sun cc doesn't like leading underscores in variable names. 
_B and _N renamed to B and N. 2004-06-24 laurent * Makefile, README, ecm.1: Fix typo in documentation and clarify the `nobase2' option. 2004-04-06 paul * bestd.c, listz.c: added Weimerskirch/Paar trick for Karatsuba (K=3) 2004-02-13 laurent * median.c: Deleted some unused lines. 2004-01-21 laurent * ecm.1: Added ecm.1 man page; needs update. 2004-01-20 laurent * ecm.h, main.c: Added usage function and changed usage output stream to stdout. 2004-01-20 jim * main.c: Fixed for MinGW builds (no res/resource.h) 2004-01-20 laurent * main.c: Fixed "-n" and "-nn" parameters for unix. 2004-01-16 jim * README, TODO, candi.c, ecm.c, ecm.h, main.c, pm1.c, pp1.c: Added -go handling. The code to use this has NOT been added to ecm_stage1() or pp1_stage1(). It has been placed into pm1_stage1(). Also, the full syntax processing has been added to main.c. Candi.c has a new structure to handle this (mpgocandi_t). can be of any valid expression form, and may contain N letter(s) as a placeholder for the current processing candidate number. * stage2.c, pp1.c, pm1.c: Modifications to allow screen percentage ticks to run through a global function. That function will only update at the proper delay, or possibly NOT show anything at all * main.c: Modifications to allow screen percentage ticks to run through a global function. That function will only update at the proper delay, or possibly NOT show anything at all The showscreenticks() and showscreenticks_change_stage() functions are in this source file. * ecm.h, ecm.c: Modifications to allow screen percentage ticks to run through a global function. That function will only update at the proper delay, or possibly NOT show anything at all * TODO: Added information about -ticdelay n (note ticdelay -1 turns off the percentage done stuff). Any other n value simply is the number of ms between percentage done stderr updates. 
2004-01-12 jim * smartprp.c: Bug in the hex escape sequence parser in the -prp cmd * README: Added explanation of the escape %xH[H] to the -prp cmd * smartprp.c: Added the escape %xH[H] to the -prp cmd cmd parsing. 2004-01-12 paul * smartprp.c: new file (added for Jim) 2004-01-12 jim * ecm.h: Updated beta version and added new declarations for the smart prp function * main.c: Use the new "smart" prp function and parse the new -prp* command args * candi.c: Use the new "smart" prp function * Makefile: Updated with new smartprp.c (and updated DIST to be all .c files) * README: Added new section about external spawned prp app, and added to the command line section * ChangeLog: Added info about changes since 5.1-beta (Hex expressions and VC porting in 5.1.1-beta) (External PRP program spawning in 5.1.2-beta) 2004-01-12 paul * ecm.h, eval.c, main.c, pm1.c, pp1.c, candi.c: number of primality loops is now controlled by PROBAB_PRIME_TESTS (ecm.h) 2004-01-09 laurent * main.c: Fixed "warning: C++ style comments are not allowed in ISO C90" 2004-01-09 jim * auxi.c, resume.c, trial.c, main.c: Porting needed for VC to build * median.c: Port change needed to compile under VC * ecm-gmp.h: Porting needed for VC to build This latest change taken from the latest (4.1.2) GMP's gmp-impl.h file. * eval.c, candi.c, bestd.c: Porting needed for VC to build * ecm.h: Bumped up beta version number. This version has hex number handling in expressions, and is ported to VC 2004-01-06 jim * eval.c: Added handling of Hex numbers to the expression parser.
* median.c: Modified to build under MinGW (and VC) * bestd.c: Modified to compile under MinGW 2003-12-12 paul * polyeval.c: changed copyright line * Makefile, bestd.c, ecm.h, listz.c, median.c, polyeval.c, stage2.c: several improvements in polyeval_tellegen (now default) 2003-12-11 paul * ecm.h, median.c, polyeval.c, bestd.c: added option -DCHECK_MULS to check number of muls * stage2.c: added warning when estimated and computed muls differ * polyeval.c: a factor of 2 was missing in muls_tuptree (case l=m) * median.c: now muls_tgen calls directly muls_gen * listz.c: replaced calls to toomcook4/toomcook4_low/toomcook4_high by calls to LIST_MULT_N/list_mul_low/list_mul_high * ecm.h: type of multiplication (kara, toom3, toom4) is now defined in ecm.h * bestd.c: replaced muls_toom4 by muls_gen 2003-12-10 laurent * polyeval.c: Multiplication accounting is added in polyeval_tellegen. 2003-12-09 laurent * stage2.c: Timing added when using polyeval_tellegen. 2003-12-09 paul * mpmod.c: changed temp1 -> temp2 in mpres_out_str 2003-12-09 laurent * ecm.h, median-aux.c, median.c, polyeval.c: Polyeval_tellegen now uses transposed Toom Cook. 2003-12-05 alex * countsmooth.c: Reports values of lucky Brent-Suyama matches. Barely tested, beware! 2003-12-05 laurent * median.c: Used better ad hoc divisions by 2 and 3. 2003-12-04 laurent * median.c: Fixed bugs in transposed Toom Cook multiplication with weird argument sizes. 2003-12-03 alex * Makefile, countsmooth.c: computes roots at 1 (mod 6). -pm1, -blocks and -ecm flags. Makefile target. 2003-12-03 laurent * Makefile, median.c: Transposed Toom-Cook3 should work now. 2003-12-02 alex * countsmooth.c: Can use getprime() now. * countsmooth.c: Tool to count smooth values. Supports Brent-Suyama. 2003-12-01 alex * TODO: Added Colin Percival's generalized DWT, moved rootsF [j*d, i] -> [j*d+1, i] to done. 2003-12-01 laurent * ecm.h, median.c, polyeval.c: Fixed muls_tkara. 
2003-11-27 paul * TODO: added new entry 2003-11-27 laurent * Makefile, bestd.c, ecm.h, median.c, polyeval.c, stage2.c: Added a new multipoint evaluation function in stage2 (polyeval_tellegen) with associated functions. #define POLYEVALTELLEGEN if you want to use it. 2003-11-26 paul * COPYING.LIB, Makefile: added COPYING.LIB (for ecm-gmp.h and memory.c) 2003-11-24 paul * test.ecm: put in sh syntax * stage2.c: compute one more term of 1/F (needed for TupTree) 2003-11-19 paul * INSTALL: new timings for ppc 2003-11-18 paul * main.c: updated minimum sizes for potential champions * INSTALL: updated timings for ppc 2003-11-17 paul * INSTALL: added timings for ppc 2003-11-07 paul * ecm.h: new function mpres_realloc * ecm.c, ecm2.c: defined local procedures as 'static' * mpmod.c: mpz_mod_n now always assumes ALLOC(r) >= nn * stage2.c: check alloc. size of f (used as mpres_t) * test.ecm: added new tests for ecm-5.0.1 bug 2003-11-05 paul * ecm.h: changed some prototypes, added some others * toomcook.c: redirect directly to karatsuba in toom4 * test.pp1: added newline before ok * test.pm1: added newline before ok message * test.ecm: ok message made similar to other tests * stage2.c: now compute estimated number of muls for stage2 (and prints corresponding percentage) * polyeval.c: modified calls to RecursiveDivision * pm1.c: removed unused variable * mpmod.c: incorporated changes to fix ecm-5.0.1 bug (overflow in input of mpz_mod_n) * listz.c: new routines to compute short products (low and high) * check.mpl: new routines to determine numbers of muls of short products * bestd.c: completely rewritten (now determine the exact number of muls and not an estimation) * TODO: removed several done items 2003-10-31 paul * test.ecm: added test for bug in 5.0.1 2003-10-20 paul * Makefile, auxi.c, bestd.c, candi.c, eval.c: changes suggested by Laurent Fousse to enable compilation with gcc 3.3.2 * TODO: removed done item 2003-10-17 paul * ChangeLog, ecm.c, ecm.h, main.c, pm1.c, pp1.c, 
stage2.c, test.ecm, test.pp1: changed quiet mode (-q) to print all factors on same line (contributed by Laurent Fousse) 2003-10-16 paul * Makefile, bestd.c, ecm.h, ecm2.c, stage2.c, test.ecm, test.pm1, test.pp1: put in patch from Alex to solve "B2min too small" problem 2003-07-22 paul * TODO: added one item (-Q) * TODO: added one item * polyeval.c: fixed potential memory leak 2003-07-02 paul * bestdaux.c, candi.c, ecm2.c, eval.c, getprime.c, listz.c, main.c, ntl.c, pm1.c, stage2.c: added check for return value of malloc 2003-06-20 paul * Makefile, ntl.c: applied patches from Christian Cornelssen * stage2.c: moved check for overflow at the beginning * TODO: added suggestion from one user 2003-06-19 paul * TODO: added one item 2003-05-09 paul * main.c: smallest P+1 champion is now 37 digits 2003-04-22 alex * Makefile, mul_lo.c, stage2.c: Changes to make building with NTL work 2003-04-22 paul * INSTALL: fixed typo * test_sh.ecm, test_sh.pm1, test_sh.pp1: now replaced by test.* * test.ecm, test.pm1, test.pp1: converted from csh to sh * Makefile: now only GMP is linked statically 2003-04-16 jim * main.c: Changes for incremental saving (but commented out). * resume.c: Changes to incremental resuming (but it is commented out) * stage2.c, pp1.c: Stage 2 overflow fixes from 5.01 * pm1.c: Stage 2 overflow fixes from 5.01 Added some new logic for the incremental save/resume, but it is commented out. * ecm2.c: Stage 2 overflow fixes from 5.01 * ecm.h: Stage 2 overflow fixes (from 5.01). * ecm.c: Partial resume code commented out for now. 
* test_sh.ecm, test_sh.pm1, test_sh.pp1, test.ecm, test.pp1, test.pm1: Added stage2 overflow test * Makefile: Added ChangeLog to EXTRADIST section * ChangeLog: Version 5.01 additional file to ECM project * INSTALL: Added new changes which were in release 5.01 Added new information about MinGW32 timings (and compiling switches) * README: Updated to Pauls changes in 5.01 2003-04-11 paul * main.c: limit for P+1 champion is now 35 digits 2003-04-09 jim * auxi.c: Failed to set the "first time flag" and convert the double to unsigned to fix a roundoff bug. 2003-04-05 alex * mpmod.c: GMP prior to 4.1 does not have GMP_NUMB_BITS defined. Using __GMP_BITS_PER_MP_LIMB in that case. * resume.c: Bugfix: save file lines were not correct if both sigma and A value was given (a semicolon was missing). 2003-03-30 jim * README: Added section 6 "ECM-GMP Expression syntax" (and adjusted original sections 6-10 to 7-11) Added docs about b1_ainc.c candi.c trial.c eval.c and the test_sh.* files which are newly added to version 5.1 List trial div, expressions, and looping in section 2 (new major items since ecm4c). Added the -I f and -i n and -B2scale switches to section 3 (efficient use of ecm-gmp) Changed section 5 (memory usage) to list that in -b breadthfirst mode (the default for -inp), that the whole file is read into memory at one time, thus increasing memory footprint (but for good reasons). Added [-inp file] as an optional parameter (even though it is really an option), and list that the redirection of a file i.e. < file is now optional [ < file]. It is optional because of -inp file. Added -t n to "Options to control factorization method" Added -B2scale f to Options to control step 2 Added -cofdec and -ve n to Options to control output Added info about incremental saving during B1 stage, and also documented that it is currently NOT working Added options "-i n", "-I f", "-n", "-nn", "-one" and "-b -d" to Miscellaneous options section. 
Added information that the first known problem has been eliminated (at least I know it is gone in my MinGW32 builds). Added 2 new "known problems" which are caused by the new stderr output for the 1 line output, and for the stage1/stage2 "percentage" done screen output (which btw, the stage2 percentage done is not yet done ;) The "second" of these new known problems is not a "certain" problem. It is one that someone (Paul) needs to look into and see how the interaction of this new stderr output works with the client server, and with things like nohup & under unix (I myself don't have access to any *nix shell) * main.c, pm1.c, pp1.c, ecm.h, ecm.c: Added a B2 scaling factor (user supplied multiplier for calculated B2 values) 2003-03-25 jim * ecm2.c: ecm.h was modified, but ecm2.c was missed and needed modified also. * resume.c: The incremental saving function has been neutered until it is working correctly in ecm/pm1/pp1 stage 1 functions. * main.c: Added -cofdef (forced cofactor in decimal even if an expression is "valid") Added a Cnnn "header" to the B1= .. line listing the length of the candidate (unless the candidate expression is explicitly listed) Fixed a couple of spots where the stderr 1 line output (in > redirection mode) was not erasing the prior line * auxi.c: Added high resolution timers to MinGW and MSVC builds. 2003-03-14 jim * main.c: B2min was broken due to handling of the -i n and -I f auto incrementation code. 2003-03-14 paul * main.c: smallest top-ten P-1 has now 39 digits * test.pm1: B1 was too small in new test * pp1.c, stage2.c, test.pm1, pm1.c: added check for overflow in stage2 + test * ecm.h: changed "unsigned int" into "unsigned long" for s parameter 2003-03-12 paul * Makefile: added -pedantic 2003-03-12 jim * main.c: On "usage" screen, had not renamed -a n to -i n, nor had -I f been added. Removed a C++ comment. 
* main.c: Removed C++ comment * resume.c, candi.c: Removed C++ comments * resume.c: Removed a couple C++ comments * pp1.c: Fixed Boo boo. * pm1.c: Fixed boo boo. * eval.c: Removed C++ comments * ecm.c: fixed boo boo. * b1_ainc.c, auxi.c: Removed some C++ comments * main.c: Changed default "shallow" mode factor finding to "deep" mode (continues to find more factors) Removed the -deep command line switch Added command line switch -one (forces ECM back into shallow mode). -i file changed to -inp file. -ib file removed. -i defaults to width-first and cat file | ecm b1 defaults to depth-first searching. added a -d for depth-first processing (to complement the -b breadth-first mode). Removed -a n (constant B1 increment mode) Added -i n (constant B1 increment mode) Added -I f (Auto calculation (with scaling by f) B1 increment mode) Removed stage 1 percentage "setup" output. That is now output in the 3 stage1 functions. * README: Added new options to document, but there is very little documentation about them yet. THIS IS STILL todo * TODO: Updated todo list with new items, and moved items that are done, or partly done * pp1.c: Finished stage 1 percentage counter * pm1.c: Finished stage1 percentage done * ecm.c: Completed percentage done in stage one, and removed 'setup' output for stage2 * stage2.c: Added "starting" code for percentage counter on stage 2 * ecm.h: Added declaration of B1 incrementation function * b1_ainc.c: Code to do auto-incrementation of B1. It works with a constant increment value, or it computes the "ideal" increment (based on current B1). It can also scale this calculated "ideal" * toomcook.c: Removed ABS define, since it is now in ecm.h * Makefile: Added new source file b1_ainc.c (does "auto" incrementation of B1 values) 2003-03-11 paul * main.c: changed P-1 champion limit * main.c, pp1.c: fixed problem with potential champions for -pp1, when it performs P-1 * ecm-gmp.h, ecm.h: moved ABS to ecm.h 2003-03-07 jim * TODO: Updated the % done item. 
* pp1.c, pm1.c: Added Stage 1 screen percentage updating. Added Stage 1 Auto incremental saving * main.c: Added -deep to trial factoring (actually added !(-deep) Added space after function calls to bring my code into existing coding specifications. * ecm.h: Added -deep command to trial factoring * trial.c: If we are not in -deep mode, then bail out after first factor is found * TODO: Added -qq for ultra quiet mode (i.e. for running under the client/server) * eval.c: Fixed bugs introduced removing C++'ism. The max val needed to be adjusted before the realloc, or we would simply realloc the same sized buffer (and then overwrite it) * main.c: Placed the fprintf(T:000) back where it needed to be. Remove any temp AutoSaved B1 file since it is no longer needed. * candi.c: Wrong function title in the validation check logic. * ecm.c: Added "plumbing" for incremental AutoSaving of B1 (every 15 minutes) !!NOT YET WORKING CORRECTLY!! Changed 1:000 percentage screen updates to once every 30 seconds. 2003-03-07 paul * Makefile, ecm.h: changed version to 5.1-beta 2003-03-07 jim * resume.c: Added const to char * in resume, and put an else so that sigma AND A could not both be written Created write_temp_resumefile() and kill_temp_resume_file() functions. * ecm.h: Added defines for write_temp_resumefile and kill_temp_resume_file (found in resume.c) Added spaces to my function declarations to more conform to the ecm standard. * eval.c: Added a strnicmp() function (in VC and MinGW, we use the native version) * TODO: Changed incremental saving and Percentage done to be partially done 2003-03-07 paul * ecm-gmp.h, ecm.c, eval.c, main.c, trial.c: fixed a few C++-specific idioms 2003-03-07 jim * ecm.c: Stage 1 ECM code for 1:000 to 1:100 done and working fine. * trial.c: The "testing code" for the T:000 to T:100 was not quite in the right place 2003-03-06 jim * main.c: Added some code to do T:000 to T:100 (and 1: 2:) "percentage done" output to stderr (i.e. 
dummy lights) Changed Factors= to factors= on the stderr line in Breadthfirst mode, now show "loop count" stderr line on first loop. in Breadthfirst mode, show the current line processing/total lines in file on the output line. * eval.c: Added C++ style single line comment to expression parser Now lines starting with '#' is a comment, and from where ever a // is found, to the end of a line is a comment. * TODO: Added information about 80 dots and how to do that with the existing stderr output of the lines/loops * trial.c: Added code to do T:000 to T:100 "percentage done" output to stderr (i.e. dummy lights) * ecm.h: Updated "internal" version to current interim build 2003-03-05 jim * test_sh.pp1: P+1 Test script file (for Bourne shell) * test_sh.pm1: P-1 Test script file (for Bourne shell) * test_sh.ecm: ECM Test script file (for Bourne shell) * dummy: Bye bye dummy * dummy: Test for PauZ (and help for me) 2003-03-05 paul * candi.c, eval.c, trial.c: files from Jim 2003-03-04 jim * main.c: LOT of changes. New switches: -a n (auto increment B1 after each loop) -i file (input file, not from stdin) -ib file (breadth-first looping) -b (breadth-first looping for stdin) -deep (continued factoring after a factor is found) -n (nice mode, only fully implemented in Win32) -nn (VERY nice mode, i.e. idle) -t n (trial factoring, up to n) -ve n (verbose expression printing, for expressions < n chars) The read_number() now calls the expression parser. read_number is only used to skip blank (or commented) lines. The expression parser does the rest. Most changes took place in the "looping" code, since now we loop either width first (the current 5.0 default), or "breadth-first", and since during looping the program may handle found factors either "shallow" (the current default 5.0 mode), or -deep. * auxi.c: nb_bits now const. A MUCH improved rand generator for Win32 * TODO: Expression parse done. -nice done (not -kill). 
a "key" added so that todo's and done's can be listed * ecm-gmp.h: alloc.h did not work with MinGW. The change was taken from a GMP header * ecm.h: Created mpcandi_t struction (for warehousing info about the candidate) Use the mpcandi_t object instead of simple mpz_t where it is needed Added functions from candi.c, eval.c, trial.c (and main.c since read_number is needed by resume code) Made nb_bits be const * resume.c: Use mpcand_t for candidate numbers Output expressions in save/resume file (if expression exists) Patches for MinGW and MSVC for machine name * Makefile: Added candi.c eval.c and trial.c to Makefile 2003-03-04 paul * main.c: check mp_bits_per_limb = GMP_NUMB_BITS * toomcook.c: rewrote comparison in toomcook4() * test.ecm: updated for new bestD() * stage2.c: use new bestD() function * main.c: new semantics of -k * README: new semantics for -k option * main.c: now default k is 0 (lower bound) * ecm.h: updated wrt changes in bestd.c * check.mpl: new functions to compute number of multiplies in step 2 * bestd.c: new code using data generated by bestdaux.c * TODO: added several new items * Makefile, bestdaux.c: added bestdaux.c, auxiliary file to determine optimal parameters for bestd.c 2003-03-03 paul * INSTALL: updated timings for Athlon and EV6 * README: added comments about probability wrt Table 1 * tune.c: print MUL_KARATSUBA_THRESHOLD and DIV_DC_THRESHOLD * main.c: error when factor found is 1 * TODO: added 3 items * mul_lo.c: put ecm.h after gmp-impl.h * ecm-gmp.h, ecm.h: now all thresholds are in ecm.h (should go after gmp-impl.h or ecm-gmp.h) * tune.c: fixed typo found by Christian Cornelssen * mpmod.c: changes from Christian Cornelssen to change thresholds * README: added info on "tune" * Makefile: use LDFLAGS for tune added tune in clean target * INSTALL: added info on Darwin 2003-02-28 paul * test.ecm: added tests for bug found by Jim * ecm.c: fixed bug found by Jim Fougeron 2003-02-24 paul * TODO: added 1 item * INSTALL, Makefile: get rid of 
-LNTL/... in Makefile added instructions for LDFLAGS/-D__freebsd in INSTALL 2003-02-24 alex * pm1.c: Added comment to why we use Dickson(4), Dickson(6), x^12,.. 2003-02-24 paul * TODO: updated 2003-02-23 alex * README: small changes, notably maximum possible B1 for ECM and P+1. 2003-02-23 paul * ecm.c, main.c: now prints the *exact* number of digits for large numbers * TODO, listz.c: updated TODO changed list_gcd to use 'p' only at the end * README: stage -> step * Makefile, ecm.h: version is now 5.0 * INSTALL: minor changes * memory.c: added tests_memory_status * mpmod.c: changed threshold for isbase2 * resume.c: added space 2003-02-23 alex * toomcook.c: Made indentation more Gnuish * main.c: Reversed order of opening save and resume file 2003-02-23 paul * ecm.h: removed __gmp_default_free * listz.c: style editing * getprime.c: forgot to reinitialize offset * ecm.c: style editing static val -> non static * TODO: added 2 items * Makefile: CFLAGS not needed in link phase 2003-02-22 alex * listz.c, lucas.c, main.c, mpmod.c, mul_lo.c, ntl.c, pm1.c, polyeval.c, polyz.c, pp1.c, resume.c, stage2.c, test.ecm, test.pm1, test.pp1, toomcook.c, tune.c, Makefile, TODO, auxi.c, bestd.c, ecm.c, ecm.h, ecm2.c, getprime.c: Changed copyright notice * auxi.c: Small cleanup of get_random_ui() 2003-02-21 alex * main.c: Initialising comment etc. with empty string to avoid comment fields containing garbage being printed. * pp1.c: Using n^2-1 instead of n-1 for inclusion in stage 1. Unconditionally setting g to 1 afterwards to avoid including n^2-1 again later. 2003-02-20 alex * README: Small changes. Comments on default values of k and Brent-Suyama updated. 
* ecm.h, mpmod.c: mpres_clear, mpres_set and mpres_swap are now macros * resume.c: added fflush() after writing save file lines to avoid partial line in case of abort * main.c: Added test for existing save file 2003-02-20 paul * test.pm1: rm -> /bin/rm * README, ecm-gmp.h: put back #include by default (unless __freebsd is defined) * ecm.h: defined macro FREE * pm1.c, pp1.c: don't print x0 when resume * memory.c: exported tests_free * main.c: __gmp_free_func -> FREE * ecm2.c: use variable for 2S+2 * ecm.c: use fprintf for error * ecm-gmp.h: use alloca when __sun is defined * auxi.c: use macro FREE * TODO: added some items * README: added note about different -save and -resume names added note about alloca.h problems * Makefile: added -static * Makefile: more changes from Granlund * Makefile, ecm.h: changed version to 5.0-beta-pl3 improved Makefile clean entry 2003-02-19 alex * auxi.c: Made changes suggested by Torbjorn * ecm2.c: Fixed bug freeing unallocated vars in ecm_rootsG_init(). (thanks again Torbjorn) * auxi.c, ecm.h, main.c: Better seed for RNG 2003-02-19 paul * Makefile, auxi.c, ecm-gmp.h: patches from Granlund for FreeBSD * Makefile, ecm-gmp.h, ecm.h, mpmod.c, test.ecm: put extract from gmp-impl.h in separate file (copyright is different) 2003-02-19 alex * ecm.c, ecm.h, ecm2.c, mpmod.c, pm1.c, pp1.c, stage2.c: P-1 selects reasonable degree for Brent-Suyama if none given by user *_roots[FG] return number of multiplies used 2003-02-19 paul * stage2.c, test.ecm, README, TODO, main.c: added -c option improved documentation * Makefile, ecm.h: changed version to ecm-5.0-beta-pl2 * Makefile, ecm.c, ecm.h, ecm2.c, mpmod.c, pp1.c, stage2.c: fixed warnings with -W * test.pm1: forced remove 2003-02-18 paul * bestd.c, ecm2.c, lucas.c, main.c, memory.c, resume.c, stage2.c: changed stream back to stdout for factors found * README, TODO: note on B2 > 100*B1 added items in TODO 2003-02-18 alex * ecm.c: The temp variable "t" was passed around, but never actually used 
anywhere. Removed. * ecm.h, ecm2.c, mpmod.c, pm1.c, stage2.c, test.pm1: ECM and P-1 print a message if a factor is found during the computation of the roots of F or G and verbosity is >= 2 Eliminated "comparison between signed and unsigned" warnings when compiling with -W Added a test for saving/resuming to test.pm1 2003-02-16 alex * ecm.c: Added default values for Brent-Suyama's extension for ECM. * TODO: Added a suggestion from Jay Berg, and one from me 2003-02-16 paul * INSTALL, Makefile, README, TODO, bestd.c, ecm.c, ecm.h, main.c, mpmod.c, pm1.c, pp1.c, stage2.c, test.pm1, test.pp1: fixed several issues found by Jay Berg 2003-02-16 alex * main.c: Fixed generation of random sigmas (new sigma for each input number) 2003-02-15 alex * pm1.c: cascade_mul_ui replaced by cascade_mul_d to avoid nasty bug: the power of a small prime is accumulated and *then* passed to cascade_mul_ui, so that overflow would occur for B1 >= 2^32. * resume.c: Make read_resumefile_line less ugly. Also reads Prime95 v22 ECM residues now. The users name and machine name are written when saving to file. 
2003-02-14 paul * README, TODO: remarks from Jay Berg * INSTALL, README: added advertizing for ECMNET * TODO, pm1.c: fixed problem with mul_casc (powerpc630-ibm-aix5.1.0.0) * Makefile, README: added c155 in distrib * Makefile, README, ecm.h: version is back to 5.0-beta added timings in README * Makefile: missing tab * Makefile: get rid of recursive make * mpmod.c, mul_lo.c: __GMP_BITS_PER_MP_LIMB -> GMP_NUMB_BITS use inline only with gcc * INSTALL: moved comments about CC/CFLAGS * c155: test number for ecm efficiency * INSTALL: added comment about CC/CFLAGS * resume.c: changed char c to int c in freadstrn (otherwise c != EOF always true on irix64) * mpmod.c: put back reduction in mpres_add/sub * Makefile: transfer LD in recursive make * mpmod.c: fixed two problems (missing include alloca.h, add_nc not always defined) * pm1.c: include gmp-mparam.h only when WANT_GMP_IMPL * ecm.h, listz.c, polyeval.c: now list_mul_z also reduces mod modulus * pm1.c: fixed typo * mpmod.c: now mpz_mod_n takes both source and destination, to avoid copies * main.c: added -primetest option * listz.c: added function to check size of residues (DEBUG) * ecm.c: used swap in add3 to avoid copies * TODO: removed done item (-primetest) 2003-02-13 paul * Makefile, ecm.h: changed version * mul_lo.c, pp1.c, stage2.c, mpmod.c: minor editing * main.c: updated on-line help * listz.c: added missing space * cputime.h: now in auxi.c * TODO: removed 2 done items * README: updated with -resume, new files, etc * ecm.h, main.c, mpmod.c, pm1.c, pp1.c, ecm.c: implemented -nobase2 option * mpmod.c, mul_lo.c: got rid of gmp-impl.h (if WANT_GMP_IMPL not defined) * Makefile: removed -pedantic now -DPOLYEVAL is no longer needed (it is the default) * ecm.h: define MUL_KARATSUBA_THRESHOLD (if not already) define POLYEVAL (if not POLYGCD) * main.c: replaced #ifdef POLYEVAL by #ifndef POLYGCD (now POLYEVAL is the default) 2003-02-13 alex * main.c: Small change to "Usage" text: added remark that -resume can read from 
stdin. 2003-02-12 paul * main.c: updated on-line help 2003-02-12 alex * resume.c: I'll try to stop violating my own specs. METHOD= values changed from PM1 and PP1 to P-1 and P+1. 2003-02-12 paul * mpmod.c: improved mpz_mod_n (gain of about 10%) * test.ecm: added -k for "extra" factor * TODO: removed done items * Makefile: removed mul_hi from tune target * mpmod.c: new REDC at mpn-level, using fast mpn_mul_lo * main.c: changed default number of blocks for POLYEVAL * ecm.h: added prototype for mul_lo * README: added "how to get the best of GMP-ECM" * mul_lo.c: low-half multiplication * tune.c: to tune mpmod algorithms * Makefile: added mul_lo and tune * stage2.c: removed number of muls without -v * Makefile: replaced CXX by LD when appropriate 2003-02-12 alex * resume.c: Routines for saving/resuming residues 2003-02-12 paul * stage2.c: fixed another memory leak * stage2.c: fixed memory leak * pm1.c: added default POWM_THRESHOLD 2003-02-11 alex * test.ecm, test.pm1, test.pp1: Modified test suites to work with new command line parameters * Makefile, ecm.c, ecm.h, ecm2.c, main.c, pm1.c, pp1.c, stage2.c, test.ecm: Added -resume option. Sigma, the A parameter and starting point are now specified by the command line option -sigma, -A and -x0. 2003-02-08 paul * INSTALL: added comment about editing Makefile 2003-02-06 alex * toomcook.c: Changed copyright. 
2003-02-06 paul * mpmod.c: added mpn-version of REDC * Makefile: ecm5 -> ecm * test.pm1: put missing | * Makefile: removed -static 2003-02-05 paul * TODO: added one item * mpmod.c: added cast to mp_limb_t * Makefile: added GMP= and NTL= in recursive make call * Makefile: updated VERSION * INSTALL, README, ecm.h: version is now 5.0-beta updated INSTALL and README * main.c: use ECM_VERSION for -save * test.ecm, test.pm1: removed too long tests * TODO: added item for long term * toomcook.c: added spaces * test.ecm, test.pm1, test.pp1: added license * stage2.c: removed #ifdef INVF (now INVF always used) added B2min added total count of muls * pp1.c: added B2min * polyz.c: removed/commented unused code * polyeval.c: added count of muls * pm1.c: updated copyright line added B2min * ntl.c: updated copyright line * mpmod.c: fixed efficiency problem in mpz_mod_n * memory.c: updated document origin * main.c: added champion treatment allow rational seed added B2min * lucas.c, listz.c, getprime.c: updated copyright line * ecm2.c: updated copyright line removed unused code * ecm.h: added licence modified prototypes to include B2min * ecm.c: updated copyright line added B2min in args to ecm() * bestd.c: updated copyright line * auxi.c: updated copyright line added cputime (from cputime.h, now removed) * TODO: removed items done * README: added explanation on - how to use P-1, P+1, ECM - table of optimal B1, B2 - memory usage - option -save * Makefile: added licence moved polyz.c in EXTRAFILES * INSTALL: updated (NTL not needed any more) 2003-02-04 paul * polyeval.c: implement algorithm POLYEVAL * stage2.c: adapted for polyeval * polyz.c: commented poly_gcd when POLYEVAL * mpmod.c: cosmetic changes * listz.c: several changes for polyeval, in particular modified PolyFromRoots to complete the whole product tree * ecm.h: several changes for polyeval * auxi.c: added ceil_log2 * Makefile: adapted to allow both compilation with NTL (POLYGCD=1) and without (default) * test.ecm: added -k 
option to one test with g1 > B2 * TODO: added one item * main.c: added -power in on-line help * main.c: added option -power 2003-01-30 alex * ecm.c, ecm.h, ecm2.c, main.c, mpmod.c, pm1.c, pp1.c, stage2.c: Command line option -dickson to control whether Dickson polys are used or not. Factors found in ecm_rootsG are handled properly. Added checksum to save file lines. 2003-01-29 alex * main.c, pm1.c, pp1.c: P-1 and P+1 are again exponentiating by N-1 and the default seed for P-1 is random again, which I had disabled during test runs. 2003-01-27 alex * mpmod.c, pm1.c, pp1.c, stage2.c, ecm.c, ecm.h, ecm2.c, main.c: Choosing the modulo reduction algorithm inside the different factoring algos now, residue and modulus passed to pm1(), pp1() and ecm() are mpz_t again. Initialisation for different mod algos are separate functions now to allow specifically choosing one. New command line parameters added for doing so. 2003-01-27 paul * mpmod.c: added explicit casts for 64-bit machines * main.c: char -> int for return value of getchar() 2003-01-13 alex * ecm.h, mpmod.c: Another attempt to check in the MODMULN code. 2003-01-12 alex * pp1.c: Fixed typo and a small memory leak in rootsG_init/clear. 2003-01-10 paul * Makefile: added mpmod.{c,o} 2003-01-03 alex * mpmod.c: Basic functionality for modular arithmetic. Plain mpz_mod, base-2 and REDC are implemented. MULMODN is to follow. * listz.c, lucas.c, main.c, memory.c, pm1.c, pp1.c, stage2.c, ecm.c, ecm.h, ecm2.c: Changes for using mpmod arithmetic. Moved computation of roots of F and G into ecm2.c, pp1.c and pm1.c, respectively. 2002-12-20 paul * stage2.c: comments should be in standard C format /* ... */ and not in C++ format // ... 2002-12-19 alex * auxi.c, ecm.h, stage2.c: Basic functions for Dickman's polynomials. Not enabled yet. 
2002-12-17 paul * test.ecm: test file for ecm (from ecm4c) * check.mpl: added useful routines for ecm * stage2.c: adapted for ecm stage 2 * main.c: removed temporary try with MPM * pp1.c: added computation of multiplies added routine to check if factor found by P-1 or P+1 now always uses PRAC * polyz.c: inhibit memory check functions during NTL call * pm1.c: adapted to generic stage 2 * main.c: added memory check functions (with -DDEBUG) fixed some memory leaks * lucas.c: started to adapt to generic modular multiplication * listz.c: put mpz_mulmod macro in ecm.h * ecm.h: modified for ecm stage 2 * ecm.c: adapted for stage 2 * TODO: removed 2 items * ecm2.c: functions for ecm stage 2 * Makefile: added ecm2.c and memory.c * memory.c: file to check memory allocation 2002-12-06 paul * pm1.c, pp1.c, ecm.c: added fflush() after "Using sigma/seed=..." * TODO: added format proposal for save/restore * TODO: changed one item * TODO: added 4 items 2002-12-05 alex * ecm.c, ecm.h, main.c, pm1.c, pp1.c, stage2.c: Support for save files half finished. Factors are returned in a new variable, f. p is only for passing residues around, and stage2() must not change p. New parameter: B1done, which tells factoring functions up to which bound stage 1 has been completed before. Save files can be created, but not read in yet. 
2002-12-03 paul * Makefile: aux.c -> auxi.c 2002-11-30 paul * aux.c, auxi.c: renamed aux.c to auxi.c (problems under Windows) 2002-11-29 paul * pp1.c: get rid of count_leading_zeros * Makefile, lucas.c: added file lucas.c * stage2.c: changed order of operands in pp1_mul * pp1.c: now uses Lucas sequences when PRAC is defined * main.c: defined B1cost for ECM * ecm.h: changed order of arguments of pp1_mul_ui for consistency added prototype for pp1_mul_prac * ecm.c: cosmetic changes * TODO: added several items 2002-11-28 paul * INSTALL, Makefile, README: added INSTALL file * pp1.c, stage2.c: use pp1_mul instead of pp1_mul_ui to avoid depending on longlong.h * Makefile: put /usr/local/gmp and /usr/local/ntl as default directories for GMP and NTL * toomcook.c: changed to "gmp.h" for consistency * ecm.h, main.c, pm1.c, pp1.c: now use random seed for all methods when sigma=0 use getpid() in addition of time() for random seed 2002-11-27 paul * pp1.c: fixed bug in pp1_mul_ui 2002-11-26 alex * ecm.c, main.c, pm1.c, pp1.c: Cleaned up handling of default values and method-specific screen output. 2002-11-26 paul * test.pp1: test file for Williams P+1 * pp1.c: Williams P+1 method * stage2.c: adapted for P+1 * ecm.h: added prototypes for P+1 added 'method' argument to stage2 functions * pm1.c: passed method=PM1_METHOD to stage2() call * main.c: added P+1 * ecm.c: added method in stage2() call * README: todo in a separate file updated * TODO: TODO file :-) * Makefile: added pp1.c 2002-11-25 paul * ecm.h, listz.c, stage2.c: added INVF trick (precomputation of 1/F to speed up divisions by F) * check.mpl: added auxiliary functions to compute numbers of multiplies of karatsuba, toomcook3, toomcook4 * README: removed one item done in TODO, added one more 2002-11-24 alex * toomcook.c: Proof for temp space now reflects the reduced memory requirements of toomcook3(). No change to code itself. 
2002-11-19 paul * ecm.h, listz.c: karatsuba() returns int again 2002-11-19 alex * ecm.c, ecm.h, main.c: Added ECM stage 1. In ecm.h, return type of karatsuba changed to void again, to match karatsuba in listz.c. * listz.c: Toom-Cook 4 is default again. buildG now prints timing info to stdout instead of stderr. 2002-11-15 paul * ecm.h: return type changed to int * toomcook.c: optimized karatsuba/toomcook3/toomcook4 thresholds for minimum of scalar multiplies * test.pm1: added one test 2002-11-14 paul * main.c: replaced __GNU_MP_VERSION etc by gmp_version (better for dynamic library) * Makefile: removed -static 2002-11-07 alex * ecm.h, toomcook.c: Bug: forgot to change definition of T in toomcook.c, so the temp space required still was 2*len+4*log_3(len). Oddly, it did not crash. Fixed (the "too much space" problem, not "the did not crash" problem). 2002-11-06 alex * listz.c, toomcook.c: toomcook[34]() use <= 2*len+2*log_3(len) temp space now 2002-11-05 paul * check.mpl: included Williams P+1 stage 1 code and test inputs * ecm.h, main.c, pm1.c, stage2.c: replaced INVS macros by if-statements (use the x+1/x trick whenever Pollard P-1 is performed) * main.c: changed default choice of B2 so that stage 2 takes about half of stage 1 (assuming Toom-Cook 3 is used) * test.pm1: added 3 more tests changed -e 6 into -e 12 (if "invs" trick not used) * stage2.c: added INVS macro to enable "invs" trick * pm1.c: added trick from Peter for Cunningham numbers * main.c: changed default k to 7 set default S to 1 for ecm, 2 for Pollard (without INVS) * listz.c: only style changes 2002-11-04 alex * ecm.h, listz.c, toomcook.c: Added toomcook4(). 2002-11-01 alex * test.pm1: Added a factor that was missed by old polygcd code to test cases 2002-10-29 alex * ecm.h, stage2.c, test.pm1: rootsG uses batch inversion for large Suyamas powers. Tried to make indentation coherent. 
2002-10-28 alex * stage2.c: Fixed bad merge, T was allocated twice 2002-10-27 alex * ecm.c, ecm.h, listz.c, main.c, pm1.c, stage2.c: Added Suyamas powers for stage 2. 2002-10-25 alex * toomcook.c: Added GPL header. Cleaned up comments a little. No change to code itself. 2002-10-25 paul * check.mpl: added function to compute Toom-Cook 3 auxiliary space * toomcook.c: edited according to GNU coding style and added analysis of temp. space needed * stage2.c: modified memory space for T * main.c: put back sigma=17 as initial value for P-1 * listz.c: incorporated toomcook3 code from Alex * ecm.h: added prototype for toomcook3 * Makefile: added toomcook.c * test.pm1: added one test * stage2.c: fixed bug in rootsG (G[0] was not set) * test.pm1: test file for Pollard P-1 * poly.c: replaced by listz.c (lists) and polyz.c (polynomials) * polyz.c: routines for polynomials of integers (mpz_t) * listz.c: routines for arrays of mpz_t's * ntl.c: NTL interface (first version) * stage2.c, pm1.c: added verbose flag * main.c: adapted for NTL interface * getprime.c: fixed problems with signed/unsigned integers * ecm.h: modified for NTL interface * ecm.c: added verbose flag * check.mpl: added routine for Pollard P-1 stage 1 * bestd.c: added missing stdlib.h * aux.c: added missing string.h * README: gcd stuff is ok now (using NTL) * Makefile: adapted for NTL interface 2002-10-24 paul * toomcook.c: Toom-Cook 3-way code from Alexander Kruppa 2002-09-13 paul * COPYING: New file. * COPYING: first version * Makefile, README, aux.c, bestd.c, check.mpl, cputime.h, ecm.c, ecm.h, getprime.c, main.c, pm1.c, poly.c, stage2.c: New file. * Makefile, README, aux.c, bestd.c, check.mpl, cputime.h, ecm.c, ecm.h, getprime.c, main.c, pm1.c, poly.c, stage2.c: first version ecm-6.4.4/ecm.h0000644023561000001540000001333712106741273010142 00000000000000/* ecm.h - public interface for libecm. 
Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Paul Zimmermann, Alexander Kruppa, David Cleaver, Cyril Bouvier. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef _ECM_H #define _ECM_H 1 #include /* for FILE */ #include #ifdef __cplusplus extern "C" { #endif typedef struct { int method; /* factorization method, default is ecm */ mpz_t x; /* starting point (if non zero) */ mpz_t sigma; /* contains sigma or A (ecm only) */ int sigma_is_A; /* if 1, 'sigma' contains A (Montgomery form), if 0, 'sigma' contains sigma (Montgomery form), if -1, 'sigma' contains A, and the input curve is in Weierstrass form y^2 = x^3 + A*x + B, with y in 'go'. */ mpz_t go; /* initial group order to preload (if NULL: do nothing), or y for Weierstrass form if sigma_is_A = -1. 
*/ double B1done; /* step 1 was already done up to B1done */ mpz_t B2min; /* lower bound for stage 2 (default is B1) */ mpz_t B2; /* step 2 bound (chosen automatically if negative) */ unsigned long k;/* number of blocks in stage 2 */ int S; /* degree of the Brent-Suyama's extension for stage 2 */ int repr; /* representation for modular arithmetic: ECM_MOD_MPZ=mpz, ECM_MOD_MODMULN=modmuln (Montgomery's quadratic multiplication), ECM_MOD_REDC=redc (Montgomery's subquadratic multiplication), ECM_MOD_GWNUM=Woltman's gwnum routines (tbd), > 16 : special base-2 representation ECM_MOD_DEFAULT: automatic choice */ int nobase2step2; /* disable special base-2 code in ecm stage 2 only */ int verbose; /* verbosity level: 0 no output, 1 normal output, 2 diagnostic output */ FILE *os; /* output stream (for verbose messages) */ FILE *es; /* error stream (for error messages) */ char *chkfilename; /* Filename to write stage 1 checkpoints to */ char *TreeFilename; /* Base filename for storing product tree of F */ double maxmem; /* Maximal amount of memory to use in stage 2, in bytes. 0. means no limit (optimise only for speed) */ double stage1time; /* Time to add for estimating expected time to find fac.*/ gmp_randstate_t rng; /* State of random number generator */ int use_ntt; /* set to 1 to use ntt poly code in stage 2 */ int (*stop_asap) (void); /* Pointer to function, if it returns 0, continue normally, otherwise exit asap.
May be NULL */ int batch; /* Batch mode */ double batch_B1; /* B1 is the limit used to calculate s for batch mode */ mpz_t batch_s; /* s is the product of primes up to B1 for batch mode */ double gw_k; /* use for gwnum stage 1 if input has form k*b^n+c */ unsigned long gw_b; /* use for gwnum stage 1 if input has form k*b^n+c */ unsigned long gw_n; /* use for gwnum stage 1 if input has form k*b^n+c */ signed long gw_c; /* use for gwnum stage 1 if input has form k*b^n+c */ } __ecm_param_struct; typedef __ecm_param_struct ecm_params[1]; #define ECM_MOD_NOBASE2 -1 #define ECM_MOD_DEFAULT 0 #define ECM_MOD_MPZ 1 #define ECM_MOD_BASE2 2 #define ECM_MOD_MODMULN 3 #define ECM_MOD_REDC 4 /* values <= -16 or >= 16 have a special meaning */ int ecm_factor (mpz_t, mpz_t, double, ecm_params); void ecm_init (ecm_params); void ecm_clear (ecm_params); /* the following interface is not supported */ int ecm (mpz_t, mpz_t, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, double, unsigned long, const int, int, int, int, int, int, FILE*, FILE*, char*, char *, double, double, gmp_randstate_t, int (*)(void), int, mpz_t, double, unsigned long, unsigned long, signed long); int pp1 (mpz_t, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, double, unsigned long, const int, int, int, int, FILE*, FILE*, char*, char *, double, gmp_randstate_t, int (*)(void)); int pm1 (mpz_t, mpz_t, mpz_t, mpz_t, double *, double, mpz_t, mpz_t, double, unsigned long, const int, int, int, int, FILE*, FILE*, char *, char*, double, gmp_randstate_t, int (*)(void)); /* different methods implemented */ #define ECM_ECM 0 #define ECM_PM1 1 #define ECM_PP1 2 /* return value of ecm, pm1, pp1 */ #define ECM_FACTOR_FOUND_STEP1 1 /* should be positive */ #define ECM_FACTOR_FOUND_STEP2 2 /* should be positive */ #define ECM_NO_FACTOR_FOUND 0 /* should be zero */ #define ECM_ERROR -1 /* should be non-zero */ #define ECM_FACTOR_FOUND_P(x) ((x) > 0) #define ECM_ERROR_P(x) ((x) < 0) #define ECM_DEFAULT_B1_DONE 1.0 #define 
ECM_IS_DEFAULT_B1_DONE(x) (x <= 1.0) /* stage 2 bound */ #define ECM_DEFAULT_B2 -1 #define ECM_IS_DEFAULT_B2(x) (mpz_sgn (x) < 0) #define ECM_DEFAULT_K 0 /* default number of blocks in stage 2. 0 = automatic choice */ #define ECM_DEFAULT_S 0 /* polynomial is chosen automatically */ /* Apple uses '\r' for newlines */ #define IS_NEWLINE(c) (((c) == '\n') || ((c) == '\r')) #ifdef __cplusplus } #endif #endif /* _ECM_H */ ecm-6.4.4/Makefile.am0000644023561000001540000001306312113421072011243 00000000000000## Process this file with automake to produce Makefile.in ACLOCAL_AMFLAGS = -I m4 # to not install libecm.la, we could write noinst_LTLIBRARIES instead of # lib_LTLIBRARIES below, however then libecm.a is not installed either # (see http://www.gnu.org/software/automake/manual/html_node/Libtool-Convenience-Libraries.html) lib_LTLIBRARIES = libecm.la EXTRA_PROGRAMS = rho batch # If we want assembly mulredc code, recurse into the right subdirectory # and set up variables to include the mulredc library from that subdir if ENABLE_ASM_REDC SUBDIRS = $(ASMPATH) MULREDCINCPATH = -I$(srcdir)/$(ASMPATH) MULREDCLIBRARY = $(builddir)/$(ASMPATH)/libmulredc.la # Add a tuning and testing program for the mulredc code EXTRA_PROGRAMS += bench_mulredc test_mulredc CLEANFILES = bench_mulredc test_mulredc bench_mulredc_CPPFLAGS = $(MULREDCINCPATH) bench_mulredc_LDADD = $(MULREDCLIBRARY) $(GMPLIB) test_mulredc_CPPFLAGS = $(MULREDCINCPATH) test_mulredc_LDADD = $(MULREDCLIBRARY) $(GMPLIB) else # Add a tuning program for the mulredc code EXTRA_PROGRAMS += bench_mulredc CLEANFILES = bench_mulredc bench_mulredc_LDADD = $(GMPLIB) endif libecm_la_SOURCES = ecm.c ecm2.c pm1.c pp1.c getprime.c listz.c lucas.c \ stage2.c toomcook.c mpmod.c mul_lo.c polyeval.c median.c \ schoen_strass.c ks-multiply.c rho.c bestd.c auxlib.c \ random.c factor.c sp.c spv.c spm.c mpzspm.c mpzspv.c \ ntt_gfp.c ecm_ntt.c pm1fs2.c mul_fft.c sets_long.c \ auxarith.c batch.c ellparam_batch.c # Link the asm redc code (if we use 
it) into libecm.la libecm_la_CPPFLAGS = $(MULREDCINCPATH) libecm_la_CFLAGS = $(OPENMP_CFLAGS) libecm_la_LDFLAGS = '-version-info 0:0:0' libecm_la_LIBADD = $(MULREDCLIBRARY) bin_PROGRAMS = ecm noinst_PROGRAMS = tune ecmfactor bench_mulredc # Most binaries want to link libecm.la, and the ones which don't will # have their own _LDADD which overrides the default LDADD here LDADD = libecm.la $(GMPLIB) ecm_CPPFLAGS = -DOUTSIDE_LIBECM ecm_CFLAGS = $(OPENMP_CFLAGS) ecm_SOURCES = auxi.c b1_ainc.c candi.c eval.c random.c main.c \ resume.c getprime.c champions.h tune_SOURCES = mpmod.c tune.c mul_lo.c listz.c auxlib.c ks-multiply.c \ toomcook.c schoen_strass.c polyeval.c median.c ecm_ntt.c \ ntt_gfp.c mpzspv.c mpzspm.c sp.c spv.c spm.c random.c \ mul_fft.c auxarith.c tune_CPPFLAGS = -DTUNE $(MULREDCINCPATH) tune_LDADD = $(MULREDCLIBRARY) $(GMPLIB) ecmfactor_CFLAGS = $(OPENMP_CFLAGS) rho_SOURCES = rho.c rho_CPPFLAGS = -DTESTDRIVE rho_LDADD = -lprimegen $(GMPLIB) $(GSL_LD_FLAGS) batch_SOURCES = batch.c batch_LDADD = $(GMPLIB) if MEMORY_DEBUG libecm_la_SOURCES += memory.c tune_SOURCES += memory.c endif if WITH_GWNUM gwdata.ld : echo "SECTIONS { .data : { . = ALIGN(0x20); *(_GWDATA) } }" >gwdata.ld libecm_la_SOURCES += Fgw.c # Use ecm_DEPENDENCIES += gwdata.ld instead? Is that possible? 
ecm_DEPENDENCIES = gwdata.ld ecm_LDFLAGS = $(AM_LDFLAGS) -Wl,gwdata.ld Fgwtest : Fgw.c gwdata.ld $(CC) $(CFLAGS) $(CPPFLAGS) -g -DTESTDRIVE -Wl,gwdata.ld -o Fgwtest Fgw.c libecm.a $(LIBS) endif include_HEADERS = ecm.h noinst_HEADERS = ecm-impl.h ecm-gmp.h ecm-ecm.h sp.h longlong.h ecm-params.h \ mpmod.h EXTRA_DIST = test.pm1 test.pp1 test.ecm README.lib INSTALL-ecm ecm.xml \ ecm-params.h.alpha-ev5 ecm-params.h.athlon64 \ ecm-params.h.default ecm-params.h.alpha-ev6 \ ecm-params.h.athlon ecm-params.h.powerpc7450 \ ecm-params.h.pentium3 ecm-params.h.pentium4 \ ecm-params.h.pentium-m ecm-params.h.powerpc970 \ ecm-params.h.mips64el ecm-params.h.armv5tel \ ecm-params.h.sparc64 ecm-params.h.ia64 \ ecm-params.h.hppa2.0 ecm-params.h.alpha-ev56 \ ecm-params.h.core2 ecm-params.h.corei5 \ mul_fft-params.h.athlon64 mul_fft-params.h.pentium3 \ mul_fft-params.h.default mul_fft-params.h.pentium4 DIST_SUBDIRS = athlon pentium4 x86_64 powerpc64 build.vc10 DISTCLEANFILES = config.m4 ecm-params: tune$(EXEEXT) @echo Optimising parameters for your system, please be patient. 
test -z "ecm-params.h" || rm -f ecm-params.h ./tune > ecm-params.h check: ecm$(EXEEXT) $(srcdir)/test.pp1 ./ecm$(EXEEXT) echo "" $(srcdir)/test.pm1 ./ecm$(EXEEXT) echo "" $(srcdir)/test.ecm ./ecm$(EXEEXT) longcheck: ecm$(EXEEXT) $(srcdir)/test.pp1 "$(VALGRIND) ./ecm$(EXEEXT)" $(srcdir)/test.pp1 "$(VALGRIND) ./ecm$(EXEEXT) -no-ntt" $(srcdir)/test.pp1 "$(VALGRIND) ./ecm$(EXEEXT) -modmuln" $(srcdir)/test.pp1 "$(VALGRIND) ./ecm$(EXEEXT) -redc" $(srcdir)/test.pp1 "$(VALGRIND) ./ecm$(EXEEXT) -mpzmod" $(srcdir)/test.pm1 "$(VALGRIND) ./ecm$(EXEEXT)" $(srcdir)/test.pm1 "$(VALGRIND) ./ecm$(EXEEXT) -no-ntt" $(srcdir)/test.pm1 "$(VALGRIND) ./ecm$(EXEEXT) -modmuln" $(srcdir)/test.pm1 "$(VALGRIND) ./ecm$(EXEEXT) -redc" $(srcdir)/test.pm1 "$(VALGRIND) ./ecm$(EXEEXT) -mpzmod" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT)" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -no-ntt" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -modmuln" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -redc" $(srcdir)/test.ecm "$(VALGRIND) ./ecm$(EXEEXT) -mpzmod" ## to be sure ecm.1 is considered as source ## (cf section "Man pages" in the automake manual) dist_man_MANS = ecm.1 ## If ./configure found xsltproc and docbook.xsl, add a rule for building ## the manpage. If they were not found, this rule is omitted and make will ## never try to rebuild the man page, which would result in an error anyway. if MAKE_MANPAGE ecm.1: $(srcdir)/ecm.xml xsltproc -o ecm.1 $(XSLDIR)/manpages/docbook.xsl $(srcdir)/ecm.xml endif ecm-6.4.4/x86_64/0000755023561000001540000000000012113421641010224 500000000000000ecm-6.4.4/x86_64/autogen.py0000755023561000001540000002135212106741272012175 00000000000000#!/usr/bin/python import re import sys def offaddr(addr, offset): if offset == 0: return "("+addr+")" else: return str(offset)+"("+addr+")" # Generate asm for addmul1_k # src and dst are pointers (stored in regs) + offsets # multiplier is in a register # rax, rbx, rcx, rdx are free for use. 
def addmul1_k(src, off_src, dst, off_dst, mult, k):
    """Generate x86-64 assembly text for a k-limb multiply-accumulate.

    The emitted code computes dst[0,k[ += mult * src[0,k[ on 8-byte limbs.

    src, dst         -- register names used as base pointers
    off_src, off_dst -- byte offsets added to those base pointers
    mult             -- operand (register name) holding the multiplier
    k                -- number of limbs to process

    The emitted code uses %rax, %rbx, %rcx and %rdx as scratch registers.

    Returns a pair (code, carry): the assembly text, and the name of the
    register (%rbx or %rcx) that holds the outgoing carry limb.
    """
    # Prologue: descriptive comments, then the first product src[0]*mult
    # split into its low word (%rbx) and high word (%rcx).
    code = "### addmul1: src[0] is " + offaddr(src, off_src) + "\n"
    code += "### dst[0] is " + offaddr(dst, off_dst) + "\n"
    code += "### mult is " + mult + "\n"
    code += "### k is " + str(k) + "\n"
    code += "### kills %rax, %rbx, %rcx, %rdx\n"
    code += "### dst[0,k[ += mult*src[0,k[ plus carry put in rcx or rbx\n"
    code += " movq " + offaddr(src, off_src) + ", %rax\n"
    code += " mulq " + mult + "\n"
    code += " movq %rax, %rbx\n"
    code += " movq %rdx, %rcx\n"

    # Template for one limb step, one instruction per line so that the
    # result can be assembled.  The __name__ markers are substituted below.
    step = (
        " movq __xii__, %rax\n"
        " mulq __mult__\n"
        " addq __cylo__, __zi__\n"
        " adcq %rax, __cyhi__\n"
        " movq %rdx, __cylo__\n"
        " adcq $0, __cylo__\n"
    )

    # The two carry registers swap roles after every limb.
    lo, hi = "%rbx", "%rcx"
    for i in range(k - 1):
        # Plain text substitution: the markers are literal strings, and the
        # operands must not be interpreted as regex replacement templates.
        limb = step.replace('__cylo__', lo)
        limb = limb.replace('__cyhi__', hi)
        limb = limb.replace('__xii__', offaddr(src, off_src + (i + 1) * 8))
        limb = limb.replace('__zi__', offaddr(dst, off_dst + i * 8))
        limb = limb.replace('__mult__', mult)
        code += limb
        lo, hi = hi, lo

    # Epilogue: fold the pending low word into the top destination limb and
    # propagate the resulting carry into the register that is returned.
    code += " addq " + lo + ", " + offaddr(dst, off_dst + 8 * (k - 1)) + "\n"
    code += " adcq $0, " + hi + "\n"
    code += "### carry limb is in " + hi + "\n"
    return code, hi

######## TODO: improve this code!!!!
def mul1_k(src, off_src, dst, off_dst, mult, k): init = "### mul1: src[0] is " + offaddr(src, off_src) + "\n" init = init + "### dst[0] is " + offaddr(dst, off_dst) + "\n" init = init + "### mult is " + mult + "\n" init = init + "### k is " + str(k) + "\n" init = init + "### kills %rax, %rbx, %rcx, %rdx\n" init = init + "### dst[0,k[ = mult*src[0,k[ plus carry put in rcx or rbx\n" init = init + " movq " + offaddr(src, off_src) + ", %rax\n" init = init + " mulq " + mult + "\n" init = init + " movq %rax, %rbx\n" init = init + " movq %rdx, %rcx\n" block = """ movq __xii__, %rax mulq __mult__ movq __cylo__, __zi__ addq %rax, __cyhi__ movq %rdx, __cylo__ adcq $0, __cylo__ """ code = init cylo = "%rbx" cyhi = "%rcx" for i in range(0,k-1): blocki = re.sub('__cylo__', cylo, block) blocki = re.sub('__cyhi__', cyhi, blocki) blocki = re.sub('__xii__', offaddr(src, off_src+(i+1)*8), blocki) blocki = re.sub('__zi__', offaddr(dst, off_dst+i*8), blocki) blocki = re.sub('__mult__', mult, blocki) code = code + blocki tmp = cylo cylo = cyhi cyhi = tmp final = " movq " + cylo + ", " + offaddr(dst, off_dst+8*(k-1)) + "\n" final = final + "### carry limb is in " + cyhi + "\n" code = code + final return code def mulredc_k_rolled(k): header = """# mp_limb_t mulredc__k(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc__k TYPE(GSYM_PREFIX`'mulredc__k,`function') GSYM_PREFIX`'mulredc__k: """ init = re.sub("__k", str(k), header) init = init + """ movq %rdx, %r11 movq %rcx, %r10 pushq %rbx pushq %rbp """ init = init + " subq $" + str(8*(2*k+1)) + ", %rsp\n" init = init + """# %r8 : inv_m # %r10 : m # %r11 : y # %rsi : x # %rdi : z # %rsp : tmp # Free registers # %rax, %rbx, %rcx, %rdx, %r9 ### set tmp[0..2k+1[ to 0 """ for i in range(0,2*k+1): init = init + " movq $0, " + offaddr("%rsp", 8*i) + "\n" code = init middle_code = "###########################################\n" middle_code = 
middle_code + " movq $" + str(k) + ", %rbp\n" middle_code = middle_code + """ .align 64 Loop: ## compute u and store in %r9 movq (%rsi), %rax mulq (%r11) addq (%rsp), %rax mulq %r8 movq %rax, %r9 """ codeaddmul, carry = addmul1_k("%r10", 0, "%rsp", 0, "%r9", k) middle_code = middle_code + codeaddmul middle_code = middle_code + " addq " + carry + ", " + offaddr("%rsp", 8*k) + "\n" middle_code = middle_code + " adcq $0, " + offaddr("%rsp", 8*(k+1)) + "\n" middle_code = middle_code + " movq (%rsi), %r9\n" codeaddmul, carry = addmul1_k("%r11", 0, "%rsp", 0, "%r9", k) middle_code = middle_code + codeaddmul middle_code = middle_code + " addq " + carry + ", " + offaddr("%rsp", 8*k) + "\n" middle_code = middle_code + " adcq $0, " + offaddr("%rsp", 8*(k+1)) + "\n\n" middle_code = middle_code + """ addq $8, %rsi addq $8, %rsp decq %rbp jnz Loop """ code = code + middle_code final = "###########################################\n" final = final + "### Copy result in z\n" for i in range(0,k): final = final + " movq " + offaddr("%rsp", 8*i) + ", %rax\n" final = final + " movq %rax, " + offaddr("%rdi", 8*i) + "\n" final = final + " movq " + offaddr("%rsp", 8*k) + ", %rax # carry\n" final = final + " addq $" + str(8*(k+1)) + ", %rsp\n" final = final + " popq %rbp\n" final = final + " popq %rbx\n" final = final + " ret\n" code = code + final return code def mulredc_k(k): header = """# mp_limb_t mulredc__k(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc__k TYPE(GSYM_PREFIX`'mulredc__k,`function') GSYM_PREFIX`'mulredc__k: """ init = re.sub("__k", str(k), header) init = init + """ movq %rdx, %r11 movq %rcx, %r10 pushq %rbx """ init = init + " subq $" + str(8*(2*k+1)) + ", %rsp\n" init = init + """# %r8 : inv_m # %r10 : m # %r11 : y # %rsi : x # %rdi : z # %rsp : tmp # Free registers # %rax, %rbx, %rcx, %rdx, %r9 ### set tmp[0..2k+1[ to 0 """ for i in range(0,2*k+1): init = init + " 
movq $0, " + offaddr("%rsp", 8*i) + "\n" code = init for i in range(0,k): blocki = "###########################################\n" blocki = blocki + "### Step " + str(i) + "\n" blocki = blocki + "### Compute u and store in %r9\n" blocki = blocki + " movq " + offaddr("%rsi", 8*i) + ", %rax\n" blocki = blocki + " mulq (%r11)\n" blocki = blocki + " addq " + offaddr("%rsp", 8*i) + ", %rax\n" blocki = blocki + " mulq %r8\n" blocki = blocki + " movq %rax, %r9\n" blocki = blocki + "### tmp[i,i+k] += x[i]*y + u*m\n" codeaddmul, carry = addmul1_k("%r10", 0, "%rsp", 8*i, "%r9", k) blocki = blocki + codeaddmul blocki = blocki + " addq " + carry + ", " + offaddr("%rsp", 8*(k+i)) + "\n" blocki = blocki + " adcq $0, " + offaddr("%rsp", 8*(k+i+1)) + "\n" blocki = blocki + " movq " + offaddr("%rsi", 8*i) + ", %r9\n" codeaddmul, carry = addmul1_k("%r11", 0, "%rsp", 8*i, "%r9", k) blocki = blocki + codeaddmul blocki = blocki + " addq " + carry + ", " + offaddr("%rsp", 8*(k+i)) + "\n" blocki = blocki + " adcq $0, " + offaddr("%rsp", 8*(k+i+1)) + "\n" code = code + blocki final = "###########################################\n" final = final + "### Copy result in z\n" for i in range(0,k): final = final + " movq " + offaddr("%rsp", 8*(k+i)) + ", %rax\n" final = final + " movq %rax, " + offaddr("%rdi", 8*i) + "\n" final = final + " movq " + offaddr("%rsp", 16*k) + ", %rax # carry\n" final = final + " addq $" + str(8*(2*k+1)) + ", %rsp\n" final = final + " popq %rbx\n" final = final + " ret\n" code = code + final return code ##print addmul1_k("%rsi", 0, "%dsi", 0, "%r9", 3) k = int(sys.argv[1]) if k == 1: print """# # mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y, # const mp_limb_t m, mp_limb_t inv_m) # # Compute z := x*y mod m, in Montgomery representation, where x, y < m # and m is n limb wide. 
inv_m is the less significant limb of the # inverse of m modulo 2^(n*GMP_LIMB_BITS) # # The result might be unreduced (larger than m) but becomes reduced # after subtracting m. The calling function should take care of that. # # We use a temporary space for unreduced product on the stack. # Therefore, this can not be used for large integers (anyway, the # algorithm is quadratic). # # WARNING: z is only n limbs but since it might be unreduced, there # could be a carry that does not fit in z. This carry is returned. include(`config.m4') TEXT GLOBL GSYM_PREFIX`'mulredc1 TYPE(GSYM_PREFIX`'mulredc1,`function') GSYM_PREFIX`'mulredc1: # %r8 : inv_m # %rcx : m # %rdx : y # %rsi : x # %rdi : z movq %rdx, %rax mulq %rsi movq %rdx, %r10 movq %rax, %r9 # store xy in [r9:r10] mulq %r8 # compute u mulq %rcx # compute u*m addq %r9, %rax # rax is 0, now (carry is important) adcq %r10, %rdx movq %rdx, (%rdi) adcq $0, %rax ret """ else: print mulredc_k_rolled(k) ecm-6.4.4/x86_64/mulredc3.asm0000644023561000001540000002451312113421640012370 00000000000000# mp_limb_t mulredc3(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. 
# That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc3 TYPE(GSYM_PREFIX`'mulredc`'3,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc3: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $32, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 8(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 16(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 24(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2. Don't fetch new data from y[j+1]. movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 8(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 16(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 24(%TP) # Store CY in tmp[j+1] cmpq $3, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq %rax, 16(%ZP) movl %CYl, %eax # use carry as return value addq $32, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc.h0000644023561000001540000001240312106741272011757 00000000000000#ifndef __ASM_REDC_H__ #define __ASM_REDC_H__ #include "config.h" #include /* Feature macro: signals to C code that assembly implementations of the 1xN mul/redc routines are available (presumably the mulredc1_N() functions declared below, which take x as a single limb). NOTE(review): the bare #include directive just before this comment has no header name in this copy of the file; the mp_limb_t and mp_size_t types used by every prototype below are presumably supplied by it (they look like GMP types) -- confirm against upstream. */ #define HAVE_NATIVE_MULREDC1_N /* Feature macro: signals that an assembly implementation of variable size redc is available (presumably ecm_redc3(), declared below, which is the only prototype here taking an explicit mp_size_t length). */ #define HAVE_ASM_REDC3 /* MULREDC_ABI is appended to every prototype below. When WINDOWS64_ABI is defined it expands to __attribute__((ms_abi)), so the mulredc*() functions are called with MS Windows parameter passing; otherwise it expands to nothing and the default calling convention is used. 
This is useful for testing the functions with the Microsoft ABI under Linux. */ #ifdef WINDOWS64_ABI #define MULREDC_ABI __attribute__((ms_abi)) #else #define MULREDC_ABI #endif extern void ecm_redc3(mp_limb_t *, const mp_limb_t *, mp_size_t, mp_limb_t) MULREDC_ABI; /* Fixed-size multiply-and-REDC routines, one prototype per operand length in limbs (mulredc1 to mulredc20). The mulredc14 assembly source names the five arguments z, x, y, m, inv_m, copies the reduced result to z[0 ... len-1] and returns the final carry word; presumably the other sizes follow the same contract (TODO confirm). The mulredc1_N variants (N = 2 to 20) have the same shape except that the second argument is a single limb passed by value instead of a pointer. WARNING: the size-1 version (mulredc1) doesn't take pointers in input: its second, third and fourth arguments are limbs passed by value, only the result is written through a pointer. */ extern mp_limb_t mulredc1(mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc2(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc3(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc4(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc5(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc6(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc7(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc8(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc9(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc10(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc11(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc12(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc13(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc14(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern 
mp_limb_t mulredc15(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc16(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc17(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc18(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc19(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc20(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_2(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_3(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_4(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_5(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_6(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_7(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_8(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_9(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_10(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_11(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_12(mp_limb_t *, const mp_limb_t, const mp_limb_t *, 
const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_13(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_14(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_15(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_16(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_17(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_18(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_19(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; extern mp_limb_t mulredc1_20(mp_limb_t *, const mp_limb_t, const mp_limb_t *, const mp_limb_t *, mp_limb_t) MULREDC_ABI; #endif ecm-6.4.4/x86_64/mulredc14.asm0000644023561000001540000010647212113421640012457 00000000000000# mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. 
# That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc14 TYPE(GSYM_PREFIX`'mulredc`'14,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. 
Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc14: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $120, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 96(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 104(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 112(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13. Don't fetch new data from y[j+1]. 
movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 96(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 104(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 112(%TP) # Store CY in tmp[j+1] cmpq $14, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movl %CYl, %eax # use carry as return value addq $120, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc5.asm0000644023561000001540000003555712113421640012404 00000000000000# mp_limb_t mulredc5(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) at entry (0x28) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc.
by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc5 TYPE(GSYM_PREFIX`'mulredc`'5,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to the tmp array define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # low 32 bits of the loop counter register i define(`ZP', `rdi')dnl # register that holds z.
Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc5: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $48, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0.
We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 24(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 32(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 40(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0.
We need to fetch x[i], tmp[0] and tmp[1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4. Don't fetch new data from y[j+1].
movq 40(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 24(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 32(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 40(%TP) # Store CY in tmp[j+1] cmpq $5, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq %rax, 32(%ZP) movl %CYl, %eax # use carry as return value addq $48, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_8.asm0000644023561000001540000002514312113421641012616 00000000000000# mp_limb_t mulredc1_8(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) at entry (0x28) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_8 TYPE(GSYM_PREFIX`'mulredc1_`'8,`function') # Implements multiplication and REDC for one input number of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently.
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_8: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = 
x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7. Don't fetch new data from y[j+1].
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 48(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 56(ZP) # Store T1 in z[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_13.asm0000644023561000001540000004073212113421641012673 00000000000000# mp_limb_t mulredc1_13(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_13 TYPE(GSYM_PREFIX`'mulredc1_`'13,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently.
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_13: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X 
= x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store T0 in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store T0 in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 88(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 96(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_11.asm0000644023561000001540000003416612113421641012675 00000000000000# mp_limb_t mulredc1_11(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_11 TYPE(GSYM_PREFIX`'mulredc1_`'11,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_11: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X 
= x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 72(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 80(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/generate_all0000755023561000001540000000026212106741272012523 00000000000000#!/bin/sh for i in 1 2; do ./autogen.py $i > mulredc$i.asm done for i in 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do m4 -DLENGTH=$i mulredc.m4 > mulredc$i.asm done ecm-6.4.4/x86_64/mulredc1_14.asm0000644023561000001540000004321712113421641012675 00000000000000# mp_limb_t mulredc1_14(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_14 TYPE(GSYM_PREFIX`'mulredc1_`'14,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values
# stay in registers and are referenced as
# YP = y, MP = m,
# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry

define(`T0', `%rsi')dnl
define(`T1', `%rbx')dnl
define(`CY', `%rcx')dnl
define(`CYl', `%ecx')dnl
define(`CYb', `%cl')dnl
define(`X', `%r14')dnl		# register that holds x value
define(`U', `%r11')dnl
define(`YP', `%r9')dnl		# register that points to the y array
define(`MP', `%r10')dnl		# register that points to the m array
define(`ZP', `%rdi')dnl		# register that holds z

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U
`#'                `YP' = YP, `MP' = MP

# Summary of the code that follows (fully unrolled, no loops):
#   u = (low word of x*y[0]) * INVM_PARAM mod 2^64
#   The sum x*y + u*m is formed word by word for j = 0 ... 13. Its lowest
#   word is zero and is not stored; the next 14 words are written to
#   z[0] ... z[13] and the final carry (0 or 1) is returned in %rax.
# Y_PARAM and INVM_PARAM are defined earlier in this file, outside the part
# visible here; presumably they name the ABI locations of y and inv_m.
# Callee-saved registers that get overwritten are pushed on entry and popped
# before ret: %rbx and %r14, plus %rsi and %rdi under WINDOWS64_ABI.
# CY is cleared once with xorl and afterwards only its low byte is written
# by setc, so the full 64-bit register always holds 0 or 1.

GSYM_PREFIX`'mulredc1_14:


#########################################################################
# i = 0 pass
#########################################################################

`#' register values at loop entry: YP = y, MP = m

# We need to compute u

	movq	(Y_PARAM), %rax		# rax = y[0] (time critical, do first)
	pushq	%rbx
	pushq	%r14
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi
	pushq	%rdi
	movq	%r9, MP			# store m in MP
	movq	Y_PARAM, YP
	movq	%rcx, ZP
	movq	%rdx, X'
,
`	movq	Y_PARAM, YP
	movq	%rcx, MP
	movq	%rsi, X			# store x in X
					# ZP is same as passed in'
)

	xorl	CYl, CYl		# set %CY to 0

	mulq	X			# rdx:rax = y[0] * x

	movq	%rax, T0		# Move low word of product to T0
	movq	%rdx, T1		# Move high word of product to T1

	imulq	INVM_PARAM, %rax	# %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64
	movq	%rax, U			# this is the new u value

	mulq	(MP)			# multiply u*m[0]
	addq	%rax, T0		# Now %T0 = 0, need not be stored
	movq	8(YP), %rax		# Fetch y[1]
	adcq	%rdx, T1		#
	setc	CYb
	# CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

# Optional self-check: abort unless the low word T0 really became zero.
# The flags are saved and restored around the test.
ifdef(`WANT_ASSERT',
`	pushf
	testq	T0, T0
	jz	assert1
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
assert1:
	popf
')
# Swap the m4 names T0 and T1. The register that held the high word in this
# pass becomes the low word of the next pass, so no run-time register move
# is needed. The same swap is repeated after every pass below.
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 1
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	8(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 0(ZP)	# Store T0 in z[1-1]
	movq	16(YP), %rax	# Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 2
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	16(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 8(ZP)	# Store T0 in z[2-1]
	movq	24(YP), %rax	# Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 3
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	24(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 16(ZP)	# Store T0 in z[3-1]
	movq	32(YP), %rax	# Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 4
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	32(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 24(ZP)	# Store T0 in z[4-1]
	movq	40(YP), %rax	# Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 5
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	40(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 32(ZP)	# Store T0 in z[5-1]
	movq	48(YP), %rax	# Fetch y[j+1] = y[6] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 6
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	48(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 40(ZP)	# Store T0 in z[6-1]
	movq	56(YP), %rax	# Fetch y[j+1] = y[7] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 7
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	56(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 48(ZP)	# Store T0 in z[7-1]
	movq	64(YP), %rax	# Fetch y[j+1] = y[8] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 8
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	64(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 56(ZP)	# Store T0 in z[8-1]
	movq	72(YP), %rax	# Fetch y[j+1] = y[9] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 9
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	72(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 64(ZP)	# Store T0 in z[9-1]
	movq	80(YP), %rax	# Fetch y[j+1] = y[10] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 10
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	80(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 72(ZP)	# Store T0 in z[10-1]
	movq	88(YP), %rax	# Fetch y[j+1] = y[11] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 11
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	88(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 80(ZP)	# Store T0 in z[11-1]
	movq	96(YP), %rax	# Fetch y[j+1] = y[12] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 12
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = value to store in tmp[j], T1 undefined
# CY = carry into T1 (is <= 2)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	96(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')
	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 88(ZP)	# Store T0 in z[12-1]
	movq	104(YP), %rax	# Fetch y[j+1] = y[13] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 13. Don't fetch new data from y[j+1].
# Final pass (j = 13) of mulredc1_14: T0 is stored to z[12], T1 to z[13],
# and the last carry is returned in %rax.
	movq	CY, T1		# T1 = CY <= 1

	mulq	X		# y[j] * x
	addq	%rax, T0	# Add low word to T0
	movq	104(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	mulq	U		# m[j]*u
	addq	%rax, T0	# Add low word to T0
	movq	T0, 96(ZP)	# Store T0 in z[j-1]
	adcq	%rdx, T1	# Add high word with carry to T1
	movq	T1, 104(ZP)	# Store T1 in z[j]
	setc	CYb		# %CY <= 1

	movq	CY, %rax	# use carry as return value
# Restore the callee-saved registers in reverse order of the pushes on entry.
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%rbx
	ret
ecm-6.4.4/x86_64/mulredc6.asm0000644023561000001540000004220112113421640012365 00000000000000# mp_limb_t mulredc6(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 40(%rsp) at entry (0x28)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored


# This stuff is run through M4 twice, first when generating the
# mulredc*.asm files from the mulredc.m4 file (when preparing the distro)
# and again when generating the mulredc*.s files from the mulredc*.asm files
# when the user compiles the program.
# We used to substitute XP etc. by register names in the first pass,
# but now with switching between Linux and Windows ABI, we do it in
# the second pass instead when we know which ABI we have, as that
# allows us to assign registers differently for the two ABIs.
# That means that the defines for XP etc., need to be quoted once to be
# protected in the first M4 pass, so that they are processed and
# occurrences of XP etc. happen only in the second pass.

include(`config.m4')

	TEXT
.align 64 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc6
	TYPE(GSYM_PREFIX`'mulredc`'6,`function')

# Implements multiplication and REDC for two input numbers of LENGTH words
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')
# tmp[0 ... len+1] = 0
# for (i = 0; i < len; i++)
#   {
#     t = x[i] * y[0]; /* Keep and reuse this product */
#     u = ((t + tmp[0]) * invm) % 2^64
#     tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */
#     for (j = 1; j < len; j++)
#       {
#         tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD);
#         /* put new carry in cy */
#       }
#     tmp[len] = cy;
#   }
# z[0 ... len-1] = tmp[0 ... len-1]
# return (tmp[len])


# Values that are referenced only once in the loop over j go into r8 .. r14,
# In the inner loop (over j), tmp, x[i], y, m, and u are constant.
# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values
# stay in registers and are referenced as
# TP = tmp, YP = y, MP = m,
# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry

define(`T0', `rsi')dnl
define(`T0l', `esi')dnl
define(`T1', `rbx')dnl
define(`T1l', `ebx')dnl
define(`CY', `rcx')dnl
define(`CYl', `ecx')dnl
define(`CYb', `cl')dnl
define(`XI', `r14')dnl		# register that holds x[i] value
define(`U', `r11')dnl
define(`XP', `r13')dnl		# register that points to the x array
define(`TP', `rbp')dnl		# register that points to t + i
define(`I', `r12')dnl		# register that holds loop counter i
define(`Il', `r12d')dnl		# register that holds loop counter i
define(`ZP', `rdi')dnl		# register that holds z. Same as passed in
ifdef(`WINDOWS64_ABI',
`define(`YP', `r8')dnl		# points to y array, same as passed in
define(`MP', `r9')dnl		# points to m array, same as passed in
define(`INVM', `r10')dnl	# register that holds invm. Loaded from the stack'
,
`define(`YP', `r9')dnl		# register that points to the y array
define(`MP', `r10')dnl		# register that points to the m array
define(`INVM', `r8')dnl		# register that holds invm. Same as passed in'
)dnl

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U
`#'                `YP' = YP, `MP' = MP, `TP' = TP

# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words
# The tmp array needs LENGTH+1 entries, the last one is so that we can
# store CY at tmp[j+1] for j == len-1

# Summary of the code below, for LENGTH = 6:
#   - Prologue: push the callee-saved registers that get overwritten, copy
#     the arguments into the registers named above, and reserve 56 bytes
#     (7 words, tmp[0..6]) on the stack. TP is set to %rsp.
#   - The i = 0 pass is unrolled on its own. It never reads the tmp array,
#     it only writes tmp[0..6].
#   - The loop at local label 1 runs for i = 1 ... 5 (I is compared with 6).
#     Each iteration adds x[i]*y + u*m to tmp and stores the result shifted
#     down by one word.
#   - Epilogue: copy tmp[0..5] to z[0..5], return the last carry in %eax,
#     release the stack frame and pop the saved registers.
# Between passes the m4 names T0 and T1 (and T0l, T1l) are swapped, so the
# register that held the high word becomes the low word of the next pass
# without a run-time move.

GSYM_PREFIX`'mulredc6:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi
	pushq	%rdi
') dnl
ifdef(`WINDOWS64_ABI',
`	movq	%rdx, %XP
	movq	%rcx, %ZP
	movq	96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes'
,
`	movq	%rsi, %XP		# store x in XP
	movq	%rdx, %YP		# store y in YP
	movq	%rcx, %MP		# store m in MP'
) dnl
	subq	$56, %rsp	# subtract size of local vars


#########################################################################
# i = 0 pass
#########################################################################

# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m
# %CY < 255 (i.e. only low byte may be != 0)

# Pass for j = 0. We need to fetch x[i] from memory and compute the new u

	movq	(%XP), %XI		# XI = x[0]
	movq	(%YP), %rax		# rax = y[0]

	xorl	%CYl, %CYl		# set %CY to 0
	lea	(%rsp), %TP		# store addr of tmp array in TP
	movl	%CYl, %Il		# Set %I to 0

	mulq	%XI			# rdx:rax = y[0] * x[i]
	addq	$1, %I

	movq	%rax, %T0		# Move low word of product to T0
	movq	%rdx, %T1		# Move high word of product to T1

# When MULREDC_SVOBODA is defined the multiplication by INVM is left out
# and the low word of the product is used as u directly.
ifdef(`MULREDC_SVOBODA',
,
`'
`	imulq	%INVM, %rax	# %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64'
)
	movq	%rax, %U		# this is the new u value

	mulq	(%MP)			# multiply u*m[0]
	addq	%rax, %T0		# Now %T0 = 0, need not be stored
	movq	8(%YP), %rax		# Fetch y[1]
	adcq	%rdx, %T1		#
	setc	%CYb
	# CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

# Optional self-check: abort unless the low word T0 really became zero.
# The flags are saved and restored around the test.
ifdef(`WANT_ASSERT',
`	pushf
	testq	%T0, %T0
	jz	2f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(2)
	popf')
define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 1
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	8(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	3f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')
	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 0(%TP)	`#' Store T0 in tmp[1-1]
	movq	16(%YP), %rax	`#' Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 2
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	16(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	3f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')
	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 8(%TP)	`#' Store T0 in tmp[2-1]
	movq	24(%YP), %rax	`#' Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 3
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	24(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	3f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')
	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 16(%TP)	`#' Store T0 in tmp[3-1]
	movq	32(%YP), %rax	`#' Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 4
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	32(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT',
`	jnc	3f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')
	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 24(%TP)	`#' Store T0 in tmp[4-1]
	movq	40(%YP), %rax	`#' Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 5. Don't fetch new data from y[j+1].

	movl	%CYl, %T1l	# T1 = CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	40(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	mulq	%U		# m[j]*u
	addq	%rax, %T0	# Add low word to T0
	movq	%T0, 32(%TP)	# Store T0 in tmp[j-1]
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T1, 40(%TP)	# Store T1 in tmp[j]
	setc	%CYb		# %CY <= 1

	movq	%CY, 48(%TP)	# Store CY in tmp[j+1]

#########################################################################
# i > 0 passes
#########################################################################

.align 32,,16
LABEL_SUFFIX(1)

# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m
# %CY < 255 (i.e. only low byte may be > 0)

# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
# and compute the new u

	movq	(%XP,%I,8), %XI		# XI = x[i]
	movq	(%YP), %rax		# rax = y[0]
#init the register tmp ring buffer
	movq	(%TP), %T0		# Load tmp[0] into T0
	movq	8(%TP), %T1		# Load tmp[1] into T1

	mulq	%XI			# rdx:rax = y[0] * x[i]
	addq	$1, %I

	addq	%T0, %rax		# Add T0 to low word
	adcq	%rdx, %T1		# Add high word with carry to T1
	setc	%CYb			# %CY <= 1

	movq	%rax, %T0		# Save sum of low words in T0
# NOTE(review): unlike the imulq in the i = 0 pass, this one is not guarded
# by MULREDC_SVOBODA - confirm that this is intended.
	imulq	%INVM, %rax		# %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64
	movq	%rax, %U		# this is the new u value

	mulq	(%MP)			# multiply u*m[0]
	addq	%rax, %T0		# Now %T0 = 0, need not be stored
	adcq	%rdx, %T1		#
# The carry flag produced by the adcq above is consumed by the first adcq of
# the j = 1 pass. The movq below does not modify flags, and the optional
# assert block saves and restores them with pushf and popf.
	movq	8(%YP), %rax		# Fetch y[1]
ifdef(`WANT_ASSERT',
`	pushf
	testq	%T0, %T0
	jz	4f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(4)
	popf')
define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 1
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	16(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	8(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	16(%YP), %rax	`#' Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 0(%TP)	`#' Store T0 in tmp[1-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 2
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	24(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	16(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	24(%YP), %rax	`#' Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 8(%TP)	`#' Store T0 in tmp[2-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 3
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	32(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	24(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	32(%YP), %rax	`#' Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 16(%TP)	`#' Store T0 in tmp[3-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 4
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	40(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	32(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	40(%YP), %rax	`#' Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 24(%TP)	`#' Store T0 in tmp[4-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 5. Don't fetch new data from y[j+1].

	movq	48(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	40(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	mulq	%U		# m[j]*u
	addq	%rax, %T0	# Add low word to T0
	movq	%T0, 32(%TP)	# Store T0 in tmp[j-1]
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T1, 40(%TP)	# Store T1 in tmp[j]
	adcb	$0, %CYb	# %CY <= 2
	movq	%CY, 48(%TP)	# Store CY in tmp[j+1]

# Loop back while I < 6. I was already incremented in the j = 0 pass.
	cmpq	$6, %I
	jb	1b

# Copy result from tmp memory to z

	movq	(%TP), %rax
	movq	8(%TP), %rdx
	movq	%rax, (%ZP)
	movq	%rdx, 8(%ZP)
	movq	16(%TP), %rax
	movq	24(%TP), %rdx
	movq	%rax, 16(%ZP)
	movq	%rdx, 24(%ZP)
	movq	32(%TP), %rax
	movq	40(%TP), %rdx
	movq	%rax, 32(%ZP)
	movq	%rdx, 40(%ZP)

	movl	%CYl, %eax	# use carry as return value
	addq	$56, %rsp
# Restore the callee-saved registers in reverse order of the pushes on entry.
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret
ecm-6.4.4/x86_64/mulredc2.asm0000644023561000001540000002007112113421640012362 00000000000000# mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 28(%rsp)
#          Needs
%rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc2 TYPE(GSYM_PREFIX`'mulredc`'2,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc2: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $24, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = 
T1 `#' Pass for j = 1. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 0(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 8(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 16(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1. Don't fetch new data from y[j+1]. 
movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 0(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 8(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 16(%TP) # Store CY in tmp[j+1] cmpq $2, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movl %CYl, %eax # use carry as return value addq $24, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc.m40000644023561000001540000003021212106741272012046 00000000000000`# mp_limb_t mulredc'LENGTH`(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,' # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. 
divert(-1) # forloop(i, from, to, stmt) define(`forloop', `pushdef(`$1', `$2')_forloop(`$1', `$2', `$3', `$4')popdef(`$1')') define(`_forloop', `ifelse(eval($1 <= `$3'), 1, `$4'`define(`$1', incr($1))_forloop(`$1', `$2', `$3', `$4')')') divert `include(`config.m4')' TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX``''mulredc`'LENGTH TYPE(GSYM_PREFIX``''mulredc``''LENGTH,``function'') # Implements multiplication and REDC for two input numbers of LENGTH words `ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')' # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry `define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z.
Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Loaded from the stack' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl' dnl Put overview of register allocation into .s file ``#'' `Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U' ``#'' ` `YP' = YP, `MP' = MP, `TP' = TP' # local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 define(`LOCALSPACE', `eval(8*(LENGTH + 1))')dnl define(`LOCALTMP', `(%rsp)')dnl GSYM_PREFIX``''mulredc`'LENGTH: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 `ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl' `ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl' subq $LOCALSPACE, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0.
We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea LOCALTMP, %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 `ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ') dnl' movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 `ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf')' dnl Cycle ring buffer. Only mappings of T0 and T1 to regs change, no MOVs! `define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl' ``#'' Now ``T0'' = T0, ``T1'' = T1 forloop(`UNROLL', 1, eval(LENGTH - 2), `dnl define(`J', `eval(8 * UNROLL)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl ``#'' Pass for j = UNROLL ``#'' Register values at entry: ``#'' %rax = y[j], %XI = x[i], %U = u ``#'' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined ``#'' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq J`'(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
`ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)')' mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, JM8`'(%TP) ``#'' Store T0 in tmp[UNROLL-1] movq J8`'(%YP), %rax ``#'' Fetch y[j+1] = y[eval(UNROLL+1)] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 dnl Cycle ring buffer. Only mappings of T0 and T1 to regs change, no MOVs! `define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl' ``#'' Now ``T0'' = T0, ``T1'' = T1 ')dnl # end forloop ``#'' Pass for j = eval(LENGTH - 1). Don't fetch new data from y[j+1]. define(`J', `eval(8*LENGTH - 8)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq J`'(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, JM8`'(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, J`'(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, J8`'(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0.
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] `ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf')' dnl Cycle ring buffer. Only mappings of T0 and T1 to regs change, no MOVs! `define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl' ``#'' Now ``T0'' = T0, ``T1'' = T1 forloop(`UNROLL', 1, eval(LENGTH - 2), `dnl define(`J', `eval(8 * UNROLL)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl ``#'' Pass for j = UNROLL ``#'' Register values at entry: ``#'' %rax = y[j], %XI = x[i], %U = u ``#'' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in ``#'' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq J8`'(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq J`'(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq J8`'(%YP), %rax ``#'' Fetch y[j+1] = y[eval(UNROLL+1)] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, JM8`'(%TP)
``#'' Store T0 in tmp[UNROLL-1] dnl Cycle ring buffer. Only mappings of T0 and T1 to regs change, no MOVs! `define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl' ``#'' Now ``T0'' = T0, ``T1'' = T1 ')dnl # end forloop ``#'' Pass for j = eval(LENGTH - 1). Don't fetch new data from y[j+1]. define(`J', `eval(8*LENGTH - 8)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl movq J8`'(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq J`'(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, JM8`'(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, J`'(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, J8`'(%TP) # Store CY in tmp[j+1] cmpq $LENGTH, %I jb 1b # Copy result from tmp memory to z dnl ==== THIS LOOP WILL NOT WORK FOR LENGTH <= 1 ==== forloop(`UNROLL', 0, eval(LENGTH / 2 - 1), `dnl define(`J', `eval(2 * UNROLL * 8)')dnl define(`J8', `eval(J + 8)')dnl ifelse(J, `0', dnl ` movq (%TP), %rax', dnl ` movq J`'(%TP), %rax') movq J8`'(%TP), %rdx ifelse(J, `0', dnl ` movq %rax, (%ZP)', dnl ` movq %rax, J`'(%ZP)') movq %rdx, J8`'(%ZP) ')dnl ifelse(eval(LENGTH % 2), 1, `dnl define(`J', `eval(LENGTH * 8 - 8)')dnl movq J`'(%TP), %rax movq %rax, J`'(%ZP) ')dnl movl %CYl, %eax # use carry as return value addq $LOCALSPACE, %rsp `ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl' popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc20.asm0000644023561000001540000014200012113421641012440 00000000000000# mp_limb_t mulredc20(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m:
%r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc20 TYPE(GSYM_PREFIX`'mulredc`'20,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc20: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $168, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 120(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 128(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 120(%TP) `#' Store T0 in tmp[16-1] movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 17 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 136(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 128(%TP) `#' Store T0 in tmp[17-1] movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 18 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 144(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 136(%TP) `#' Store T0 in tmp[18-1] movq 152(%YP), %rax `#' Fetch y[j+1] = y[19] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 19. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 152(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 144(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 152(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 160(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + 
tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 104(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 120(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 112(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 128(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 120(%MP) # m[j]*u addq %rax, %T0 # Add 
T0 and low word movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 136(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 128(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 120(%TP) `#' Store T0 in tmp[16-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 17 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 144(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 136(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 128(%TP) `#' Store T0 in tmp[17-1] define(`TT', defn(`T0'))dnl define(`TTl', 
defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 18 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 152(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 144(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 152(%YP), %rax `#' Fetch y[j+1] = y[19] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 136(%TP) `#' Store T0 in tmp[18-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 19. Don't fetch new data from y[j+1]. 
movq 160(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 152(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 144(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 152(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 160(%TP) # Store CY in tmp[j+1] cmpq $20, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movq 112(%TP), %rax movq 120(%TP), %rdx movq %rax, 112(%ZP) movq %rdx, 120(%ZP) movq 128(%TP), %rax movq 136(%TP), %rdx movq %rax, 128(%ZP) movq %rdx, 136(%ZP) movq 144(%TP), %rax movq 152(%TP), %rdx movq %rax, 144(%ZP) movq %rdx, 152(%ZP) movl %CYl, %eax # use carry as return value addq $168, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc17.asm0000644023561000001540000012423512113421640012457 00000000000000# mp_limb_t mulredc17(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 
file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc17 TYPE(GSYM_PREFIX`'mulredc`'17,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # 32-bit alias (r12d) of the loop counter register I define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc17: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $144, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 120(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 128(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 120(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 128(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 136(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + 
tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 104(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 120(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 112(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 128(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 120(%MP) # m[j]*u addq %rax, %T0 # Add 
T0 and low word movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16. Don't fetch new data from y[j+1]. movq 136(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 128(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 120(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 128(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 136(%TP) # Store CY in tmp[j+1] cmpq $17, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movq 112(%TP), %rax movq 120(%TP), %rdx movq %rax, 112(%ZP) movq %rdx, 120(%ZP) movq 128(%TP), %rax movq %rax, 128(%ZP) movl %CYl, %eax # use carry as return value addq $144, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_4.asm0000644023561000001540000001364712113421641012620 00000000000000# mp_limb_t mulredc1_4(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, 
mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) at entry (0x28; read as 72(%rsp) after the four pushes) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_4 TYPE(GSYM_PREFIX`'mulredc1_`'4,`function') # Implements multiplication and REDC for one input number of LENGTH (here 4) words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_4: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq 
%rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 16(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 24(ZP) # Store T1 in z[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_3.asm0000644023561000001540000001137012113421641012606 00000000000000# mp_limb_t mulredc1_3(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) at entry (0x28; read as 72(%rsp) after the four pushes) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_3 TYPE(GSYM_PREFIX`'mulredc1_`'3,`function') # Implements multiplication and REDC for one input number of LENGTH (here 3) words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_3: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = 
x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 8(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 16(ZP) # Store T1 in z[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_20.asm0000644023561000001540000006131512113421641012671 00000000000000# mp_limb_t mulredc1_20(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_20 TYPE(GSYM_PREFIX`'mulredc1_`'20,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_20: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X 
= x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store T0 in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store T0 in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 88(ZP) # Store T0 in z[12-1] movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 13 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 104(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 96(ZP) # Store T0 in z[13-1] movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 14 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 112(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 104(ZP) # Store T0 in z[14-1] movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 15 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 120(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 112(ZP) # Store T0 in z[15-1] movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 16 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 128(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 120(ZP) # Store T0 in z[16-1] movq 136(YP), %rax # Fetch y[j+1] = y[17] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 17 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 136(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 128(ZP) # Store T0 in z[17-1] movq 144(YP), %rax # Fetch y[j+1] = y[18] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 18 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 144(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 136(ZP) # Store T0 in z[18-1] movq 152(YP), %rax # Fetch y[j+1] = y[19] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 19. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 152(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 144(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 152(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_9.asm0000644023561000001540000002742212113421641012621 00000000000000# mp_limb_t mulredc1_9(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) on entry # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_9 TYPE(GSYM_PREFIX`'mulredc1_`'9,`function') # Implements multiplication and REDC for one input number of LENGTH words # and a multiplier of one word # Writes the 9 result words to z[0..8] and returns the final carry (0 or 1) in %rax ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_9: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = 
x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 56(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 64(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_15.asm0000644023561000001540000004550412113421641012677 00000000000000# mp_limb_t mulredc1_15(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_15 TYPE(GSYM_PREFIX`'mulredc1_`'15,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_15: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X 
= x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store T0 in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store T0 in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 88(ZP) # Store T0 in z[12-1] movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 13 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 104(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 96(ZP) # Store T0 in z[13-1] movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 14. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 112(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 104(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 112(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc18.asm0000644023561000001540000013067612113421640012466 00000000000000# mp_limb_t mulredc18(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc18 TYPE(GSYM_PREFIX`'mulredc`'18,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... 
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14. # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # low 32 bits of I, the loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in on Linux ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm, loaded from the stack' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc18: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $152, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 120(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 128(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 120(%TP) `#' Store T0 in tmp[16-1] movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 17. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 136(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 128(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 136(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 144(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + 
tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 104(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 120(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 112(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 128(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 120(%MP) # m[j]*u addq %rax, %T0 # Add 
T0 and low word movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 136(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 128(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 120(%TP) `#' Store T0 in tmp[16-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 17. Don't fetch new data from y[j+1]. 
movq 144(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 136(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 128(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 136(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 144(%TP) # Store CY in tmp[j+1] cmpq $18, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movq 112(%TP), %rax movq 120(%TP), %rdx movq %rax, 112(%ZP) movq %rdx, 120(%ZP) movq 128(%TP), %rax movq 136(%TP), %rdx movq %rax, 128(%ZP) movq %rdx, 136(%ZP) movl %CYl, %eax # use carry as return value addq $152, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_18.asm0000644023561000001540000005454312113421641012705 00000000000000# mp_limb_t mulredc1_18(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT
.align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_18 TYPE(GSYM_PREFIX`'mulredc1_`'18,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_18: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] 
adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with 
carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store T0 in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store T0 in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 88(ZP) # Store T0 in z[12-1] movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 13 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 104(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 96(ZP) # Store T0 in z[13-1] movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 14 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 112(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 104(ZP) # Store T0 in z[14-1] movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 15 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 120(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 112(ZP) # Store T0 in z[15-1] movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 16 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 128(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 120(ZP) # Store T0 in z[16-1] movq 136(YP), %rax # Fetch y[j+1] = y[17] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 17. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 136(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 128(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 136(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc19.asm0000644023561000001540000013533712113421641012467 00000000000000# mp_limb_t mulredc19(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc19 TYPE(GSYM_PREFIX`'mulredc`'19,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... 
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # low 32 bits of the register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in on Linux; copied from rcx on Windows ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Loaded from the stack in the prologue' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc19: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $160, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 120(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 112(%TP) `#' Store T0 in tmp[15-1] movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 128(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 120(%TP) `#' Store T0 in tmp[16-1] movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 17 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 136(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 128(%TP) `#' Store T0 in tmp[17-1] movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 18. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 144(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 136(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 144(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 152(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + 
tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 104(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 120(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 112(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 128(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 120(%MP) # m[j]*u addq %rax, %T0 # Add 
T0 and low word movq 128(%YP), %rax `#' Fetch y[j+1] = y[16] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 112(%TP) `#' Store T0 in tmp[15-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 16 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 136(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 128(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 136(%YP), %rax `#' Fetch y[j+1] = y[17] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 120(%TP) `#' Store T0 in tmp[16-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 17 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 144(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 136(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 144(%YP), %rax `#' Fetch y[j+1] = y[18] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 128(%TP) `#' Store T0 in tmp[17-1] define(`TT', defn(`T0'))dnl define(`TTl', 
defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 18. Don't fetch new data from y[j+1]. movq 152(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 144(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 136(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 144(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 152(%TP) # Store CY in tmp[j+1] cmpq $19, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movq 112(%TP), %rax movq 120(%TP), %rdx movq %rax, 112(%ZP) movq %rdx, 120(%ZP) movq 128(%TP), %rax movq 136(%TP), %rdx movq %rax, 128(%ZP) movq %rdx, 136(%ZP) movq 144(%TP), %rax movq %rax, 144(%ZP) movl %CYl, %eax # use carry as return value addq $160, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_10.asm0000644023561000001540000003170512113421641012670 00000000000000# mp_limb_t mulredc1_10(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # 
Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) on entry # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_10 TYPE(GSYM_PREFIX`'mulredc1_`'10,`function') # Implements multiplication and REDC for one input number of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_10: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, 
T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 64(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 72(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_16.asm0000644023561000001540000004777112113421641012710 00000000000000# mp_limb_t mulredc1_16(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_16 TYPE(GSYM_PREFIX`'mulredc1_`'16,`function') # Implements multiplication and REDC for one input number of LENGTH words # and a multiplier of one word. # Stores the 16 result words in z[0..15] and returns the final carry in %rax ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently.
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_16: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X
= x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store T0 in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store T0 in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 88(ZP) # Store T0 in z[12-1] movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 13 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 104(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 96(ZP) # Store T0 in z[13-1] movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 14 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 112(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 104(ZP) # Store T0 in z[14-1] movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 15. Don't fetch new data from y[j+1].
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 120(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 112(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 120(ZP) # Store T1 in z[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1.m40000644023561000001540000001320112106741272012126 00000000000000`# mp_limb_t mulredc1_'LENGTH`(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y,' # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored divert(-1)dnl # forloop(i, from, to, stmt)dnl define(`forloop', `pushdef(`$1', `$2')_forloop(`$1', `$2', `$3', `$4')popdef(`$1')')dnl define(`_forloop', `ifelse(eval($1 <= `$3'), 1, `$4'`define(`$1', incr($1))_forloop(`$1', `$2', `$3', `$4')')')dnl divert `include(`config.m4')' `ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl' TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX``''mulredc1_`'LENGTH TYPE(GSYM_PREFIX``''mulredc1_``''LENGTH,``function'') # Implements multiplication and REDC for one input number of LENGTH words # and a multiplier of one word. # Stores the result words in z and returns the final carry in %rax `ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')' # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently.
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry `define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z' dnl Put overview of register allocation into generated code ``#'' Register vars: ``T0'' = T0, ``T1'' = T1, ``CY'' = CY, ``X'' = X, ``U'' = U ``#'' ``YP'' = YP, ``MP'' = MP GSYM_PREFIX``''mulredc1_`'LENGTH: ######################################################################### # i = 0 pass ######################################################################### ``#'' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 `ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' )' xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 `ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ')' dnl Cycle ring buffer. Only mappings of T0 and T1 to regs change, no MOVs!
`define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl' ``#'' Now ``T0'' = T0, ``T1'' = T1 forloop(`UNROLL', 1, eval(LENGTH - 2), `dnl define(`J', `eval(8 * UNROLL)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl `#' Pass for j = UNROLL `#' Register values at entry: `#' %rax = y[j], X = x, U = u `#' T0 = value to store in tmp[j], T1 undefined `#' CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq J`'(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! `ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`')' mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, JM8`'(ZP) `#' Store T0 in z[UNROLL-1] movq J8`'(YP), %rax `#' Fetch y[j+1] = y[eval(UNROLL+1)] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 dnl Cycle ring buffer. Only mappings of T0 and T1 to regs change, no MOVs! `define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl' ``#'' Now ``T0'' = T0, ``T1'' = T1 ')dnl # end forloop `#' Pass for j = eval(LENGTH - 1). Don't fetch new data from y[j+1].
define(`J', `eval(8*LENGTH - 8)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq J`'(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, JM8`'(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, J`'(ZP) # Store T1 in z[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value `ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl' popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc9.asm0000644023561000001540000005766712113421640012416 00000000000000# mp_limb_t mulredc9(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass.
include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc9 TYPE(GSYM_PREFIX`'mulredc`'9,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. 
Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc9: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $80, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 56(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 64(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 72(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8. Don't fetch new data from y[j+1]. 
movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 56(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 64(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 72(%TP) # Store CY in tmp[j+1] cmpq $9, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq %rax, 64(%ZP) movl %CYl, %eax # use carry as return value addq $80, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_12.asm0000644023561000001540000003645012113421641012674 00000000000000
# mp_limb_t mulredc1_12(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y,
#                       const mp_limb_t *m, mp_limb_t inv_m);
#
# One multiply-and-REDC step with a one-word multiplier x and 12-word
# operands y and m:
#   u        = (x * y[0] * inv_m) mod 2^64
#   z[0..11] = low 12 words of (x*y + u*m) / 2^64
#   returns (in %rax) the carry out of that quotient, 0 or 1
# The low word of x*y[0] + u*m[0] is never stored; the code treats it as
# zero (this is what the WANT_ASSERT check after the j = 0 pass verifies).
# NOTE(review): that presumably requires inv_m = -1/m[0] mod 2^64 -- confirm
# against the callers.
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) at entry,
#          i.e. 72(%rsp) once the four 8-byte pushes below have been done
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored

include(`config.m4')

ifdef(`WINDOWS64_ABI',
`define(`Y_PARAM', `%r8')dnl
define(`INVM_PARAM',`72(%rsp)')dnl'
,
`define(`Y_PARAM', `%rdx')dnl
define(`INVM_PARAM',`%r8')dnl'
)dnl
	TEXT
.align 64 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc1_12
	TYPE(GSYM_PREFIX`'mulredc1_`'12,`function')

# Implements multiplication and REDC for one input number of 12 words
# and a multiplier of one word
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')

# This variant has no outer loop over i and no tmp array on the stack: x is
# a single word and every finished result word is stored straight into z.
# The passes over j are fully unrolled. The values that stay in registers
# across the passes are referenced as
# YP = y, MP = m,
# X = x, T0/T1 = the two live accumulator words, CY = carry
# T0 and T1 swap names after every pass (see the define/defn blocks), so the
# high word of one pass becomes the low word of the next without a move.

define(`T0', `%rsi')dnl
define(`T1', `%rbx')dnl
define(`CY', `%rcx')dnl
define(`CYl', `%ecx')dnl
define(`CYb', `%cl')dnl
define(`X', `%r14')dnl		# register that holds x value
define(`U', `%r11')dnl
define(`YP', `%r9')dnl		# register that points to the y array
define(`MP', `%r10')dnl		# register that points to the m array
define(`ZP', `%rdi')dnl		# register that holds z

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U
`#'                `YP' = YP, `MP' = MP

GSYM_PREFIX`'mulredc1_12:

#########################################################################
# i = 0 pass
#########################################################################

`#' register values at loop entry: YP = y, MP = m

# We need to compute u

	movq	(Y_PARAM), %rax		# rax = y[0] (time critical, do first)
	pushq	%rbx
	pushq	%r14
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi
	pushq	%rdi
	movq	%r9, MP			# store m in MP
	movq	Y_PARAM, YP
	movq	%rcx, ZP
	movq	%rdx, X'
,
`	movq	Y_PARAM, YP
	movq	%rcx, MP
	movq	%rsi, X		# store x in X
	# ZP is same as passed in'
)

	# Clearing the full register here is what keeps CY in {0, 1} later:
	# afterwards only its low byte is ever written, by setc.
	xorl	CYl, CYl		# set %CY to 0

	mulq	X			# rdx:rax = y[0] * x

	movq	%rax, T0		# Move low word of product to T0
	movq	%rdx, T1		# Move high word of product to T1

	imulq	INVM_PARAM, %rax	# %rax = (x*y[0]*invm) % 2^64
	movq	%rax, U			# this is the new u value

	mulq	(MP)			# multiply u*m[0]
	addq	%rax, T0		# Now %T0 = 0, need not be stored
	movq	8(YP), %rax		# Fetch y[1]
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb
	# CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

ifdef(`WANT_ASSERT', `
	pushf
	testq	T0, T0
	jz	assert1
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
assert1:
	popf
')
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 1
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	8(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 0(ZP)	# Store finished low word in z[1-1]
	movq	16(YP), %rax	# Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 2
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	16(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 8(ZP)	# Store finished low word in z[2-1]
	movq	24(YP), %rax	# Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 3
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	24(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 16(ZP)	# Store finished low word in z[3-1]
	movq	32(YP), %rax	# Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 4
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	32(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 24(ZP)	# Store finished low word in z[4-1]
	movq	40(YP), %rax	# Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 5
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	40(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 32(ZP)	# Store finished low word in z[5-1]
	movq	48(YP), %rax	# Fetch y[j+1] = y[6] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 6
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	48(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 40(ZP)	# Store finished low word in z[6-1]
	movq	56(YP), %rax	# Fetch y[j+1] = y[7] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 7
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	56(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 48(ZP)	# Store finished low word in z[7-1]
	movq	64(YP), %rax	# Fetch y[j+1] = y[8] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 8
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	64(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 56(ZP)	# Store finished low word in z[8-1]
	movq	72(YP), %rax	# Fetch y[j+1] = y[9] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 9
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	72(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 64(ZP)	# Store finished low word in z[9-1]
	movq	80(YP), %rax	# Fetch y[j+1] = y[10] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 10
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = accumulated word at position j (ends up in z[j-1]), T1 undefined
# CY = carry into T1 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	80(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 72(ZP)	# Store finished low word in z[10-1]
	movq	88(YP), %rax	# Fetch y[j+1] = y[11] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 11. Don't fetch new data from y[j+1].
# This last pass writes the two top result words z[10] and z[11] and leaves
# the final carry in CY. The movq stores between the add/adc/setc
# instructions are safe because mov does not modify the flags.

	movq	CY, T1		# T1 = CY <= 1

	mulq	X		# y[j] * x
	addq	%rax, T0	# Add low word to T0
	movq	88(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	mulq	U		# m[j]*u
	addq	%rax, T0	# Add low word to T0
	movq	T0, 80(ZP)	# Store T0 in z[j-1]
	adcq	%rdx, T1	# Add high word with carry to T1
	movq	T1, 88(ZP)	# Store T1 in z[j]
	setc	CYb		# %CY <= 1

	movq	CY, %rax	# use carry as return value
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%rbx
	ret
ecm-6.4.4/x86_64/mulredc13.asm0000644023561000001540000010203112113421640012441 00000000000000# mp_limb_t mulredc13(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc13 TYPE(GSYM_PREFIX`'mulredc`'13,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ...
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], U = u, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to the tmp array define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # low 32 bits of the register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in under the Linux ABI, copied from %rcx under the Windows ABI ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc13: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $112, %rsp # subtract size of local vars (tmp[0 ... 13], 14 words of 8 bytes) ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 - the 32-bit xor also zeroes the upper half of the 64-bit register lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 88(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 96(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 104(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12. Don't fetch new data from y[j+1]. movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 88(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 96(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 104(%TP) # Store CY in tmp[j+1] cmpq $13, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq %rax, 96(%ZP) movl %CYl, %eax # use carry as return value addq $112, %rsp ifdef(`WINDOWS64_ABI', ` 
popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc12.asm0000644023561000001540000007537612113421640012465 00000000000000# mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc12 TYPE(GSYM_PREFIX`'mulredc`'12,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. 
r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to the tmp array define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # low 32 bits of the loop counter register define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm, loaded from the stack' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc12: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 pushes, ret addr, 32-byte shadow space = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $104, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 80(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 88(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 96(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[0] and tmp[1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11. 
Don't fetch new data from y[j+1]. movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 80(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 88(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 96(%TP) # Store CY in tmp[j+1] cmpq $12, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movl %CYl, %eax # use carry as return value addq $104, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_19.asm0000644023561000001540000005703012113421641012700 00000000000000# mp_limb_t mulredc1_19(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Stores the 19 limbs of (x*y + u*m) / 2^64 in z[0..18], where # u = x*y[0]*inv_m mod 2^64, and returns the carry out of the top limb (0 or 1). # The lowest limb of x*y + u*m is not stored: it is zero provided inv_m = -1/m[0] mod 2^64 # (presumably guaranteed by the caller; only checked when WANT_ASSERT is defined). # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 0x28(%rsp) on entry, i.e. 72(%rsp) after the four pushes below # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_19 TYPE(GSYM_PREFIX`'mulredc1_`'19,`function') # Implements multiplication and REDC for one input number of LENGTH words # and
a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_19: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = (x*y[0]*invm) % 2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC
code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store the sum (in %rax) in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store the sum (in %rax) in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store the sum (in %rax) in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store the sum (in %rax) in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store the sum (in %rax) in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store the sum (in %rax) in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store the sum (in %rax) in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store the sum (in %rax) in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store the sum (in %rax) in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store the sum (in %rax) in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store the sum (in %rax) in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 88(ZP) # Store the sum (in %rax) in z[12-1] movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 13 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 104(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 96(ZP) # Store the sum (in %rax) in z[13-1] movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 14 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 112(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 104(ZP) # Store the sum (in %rax) in z[14-1] movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 15 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 120(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 112(ZP) # Store the sum (in %rax) in z[15-1] movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 16 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 128(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 120(ZP) # Store the sum (in %rax) in z[16-1] movq 136(YP), %rax # Fetch y[j+1] = y[17] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 17 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 136(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 128(ZP) # Store the sum (in %rax) in z[17-1] movq 144(YP), %rax # Fetch y[j+1] = y[18] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 18. Don't fetch new data from y[j+1].
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x addq %rax, T0 # Add low word to T0 movq 144(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 136(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 144(ZP) # Store T1 in z[j], the top limb z[18] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_17.asm0000644023561000001540000005225612113421641012703 00000000000000# mp_limb_t mulredc1_17(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_17 TYPE(GSYM_PREFIX`'mulredc1_`'17,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently.
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_17: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X 
= x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store T0 in z[5-1] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 40(ZP) # Store T0 in z[6-1] movq 56(YP), %rax # Fetch y[j+1] = y[7] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 7 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 56(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 48(ZP) # Store T0 in z[7-1] movq 64(YP), %rax # Fetch y[j+1] = y[8] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 8 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 64(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 56(ZP) # Store T0 in z[8-1] movq 72(YP), %rax # Fetch y[j+1] = y[9] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 9 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 72(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 64(ZP) # Store T0 in z[9-1] movq 80(YP), %rax # Fetch y[j+1] = y[10] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 10 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 80(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 72(ZP) # Store T0 in z[10-1] movq 88(YP), %rax # Fetch y[j+1] = y[11] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 11 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 88(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 80(ZP) # Store T0 in z[11-1] movq 96(YP), %rax # Fetch y[j+1] = y[12] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 12 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 96(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 88(ZP) # Store T0 in z[12-1] movq 104(YP), %rax # Fetch y[j+1] = y[13] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 13 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 104(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 96(ZP) # Store T0 in z[13-1] movq 112(YP), %rax # Fetch y[j+1] = y[14] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 14 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 112(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 104(ZP) # Store T0 in z[14-1] movq 120(YP), %rax # Fetch y[j+1] = y[15] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 15 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 120(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 112(ZP) # Store T0 in z[15-1] movq 128(YP), %rax # Fetch y[j+1] = y[16] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 16. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 128(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 120(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 128(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc16.asm0000644023561000001540000011757412113421640012466 00000000000000# mp_limb_t mulredc16(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc16 TYPE(GSYM_PREFIX`'mulredc`'16,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... 
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc16: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $136, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 104(%TP) `#' Store T0 in tmp[14-1] movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 120(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 112(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 120(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 128(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + 
tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 104(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 120(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 112(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 120(%YP), %rax `#' Fetch y[j+1] = y[15] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 104(%TP) `#' Store T0 in tmp[14-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 15. Don't fetch new data from y[j+1]. 
movq 128(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 120(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 112(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 120(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 128(%TP) # Store CY in tmp[j+1] cmpq $16, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movq 112(%TP), %rax movq 120(%TP), %rdx movq %rax, 112(%ZP) movq %rdx, 120(%ZP) movl %CYl, %eax # use carry as return value addq $136, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1.asm0000644023561000001540000000322612106741272012374 00000000000000# # mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y, # const mp_limb_t m, mp_limb_t inv_m) # # Compute z := x*y mod m, in Montgomery representation, where x, y < m # and m is n limb wide. inv_m is the less significant limb of the # inverse of m modulo 2^(n*GMP_LIMB_BITS) # # The result might be unreduced (larger than m) but becomes reduced # after subtracting m. The calling function should take care of that. # # We use a temporary space for unreduced product on the stack. # Therefore, this can not be used for large integers (anyway, the # algorithm is quadratic). 
#
#  WARNING: z is only n limbs but since it might be unreduced, there
#  could be a carry that does not fit in z. This carry is returned.
#
#  Summary of the code below (single-limb case, so n = 1 here):
#    u    = (low word of x*y) * inv_m  mod 2^64
#    z[0] = high word of (x*y + u*m), taken mod 2^64
#    return value in %rax = carry out of that high word (0 or 1)
#  The low word of x*y + u*m is expected to be zero; a build with
#  WANT_ASSERT defined aborts if it is not.


include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc1
	TYPE(GSYM_PREFIX`'mulredc1,`function')

ifdef(`WINDOWS64_ABI',
# stack: inv_m, %r9: m, %r8: y, %rdx: x, %rcx: *z
`define(`INV_M', `0x28(%rsp)')
define(`M', `%r9')
define(`Y', `%r8')
define(`X', `%rdx')
define(`Z', `%rcx')
define(`TMP2', `%r10')
define(`TMP1', `%r8')',
# %r8: inv_m, %rcx: m, %rdx: y, %rsi : x, %rdi : *z
`define(`INV_M', `%r8')
define(`M', `%rcx')
define(`Y', `%rdx')
define(`X', `%rsi')
define(`Z', `%rdi')
define(`TMP2', `%r10')
define(`TMP1', `%r9')')

# Note on the register assignment above: in the Windows branch TMP1 is the
# same register as Y (%r8). That is safe only because y is copied to %rax
# by the first instruction, before TMP1 is written. No stack space is used
# and nothing is pushed, so 0x28(%rsp) still addresses inv_m when it is read.

GSYM_PREFIX`'mulredc1:
	movq	Y, %rax			# rax = y
	mulq	X			# rdx:rax = x * y
	movq	%rdx, TMP2		# TMP2 = high word of x*y
	movq    %rax, TMP1      # TMP1 = low word of x*y (TMP2:TMP1 = x*y)
	mulq    INV_M           # rax = u = low(x*y) * inv_m mod 2^64; rdx is scratch
	mulq    M               # rdx:rax = u * m
	addq	TMP1, %rax       # low words cancel, rax is 0 now; the carry
				 # flag produced here is consumed by adcq below
ifdef(`WANT_ASSERT', `
	jz	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx
	call	abort@plt
LABEL_SUFFIX(1)')
	adcq	TMP2, %rdx		# rdx = high(u*m) + high(x*y) + carry
	movq    %rdx, (Z)		# z[0] = result word (may still be >= m)
	adcq	$0, %rax		# rax was 0, so rax = carry out = return value
	ret
ecm-6.4.4/x86_64/mulredc1_5.asm0000644023561000001540000001612612113421641012614 00000000000000# mp_limb_t mulredc1_5(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 0x28(%rsp) at entry
#          (read below as 72(%rsp), i.e. after four 8-byte pushes)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored


include(`config.m4')

ifdef(`WINDOWS64_ABI',
`define(`Y_PARAM', `%r8')dnl
define(`INVM_PARAM',`72(%rsp)')dnl'
,
`define(`Y_PARAM', `%rdx')dnl
define(`INVM_PARAM',`%r8')dnl'
)dnl
	TEXT
.align 64 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc1_5
	TYPE(GSYM_PREFIX`'mulredc1_`'5,`function')

# Implements multiplication and REDC for one input number of LENGTH words
# (LENGTH is 5 in this file) and a multiplier of one word
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')

# Values that are referenced only once in the loop over j go into r8 ..
r14,
# In the inner loop (over j), tmp, x[i], y, m, and u are constant.
# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values
# stay in registers and are referenced as
# YP = y, MP = m,
# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry
#
# NOTE: the tmp[] naming comes from the general multi-word routine. This
# one-word-multiplier variant allocates no tmp array and has no loop over i;
# the words called tmp[j] in the comments are written straight to z.

define(`T0', `%rsi')dnl
define(`T1', `%rbx')dnl
define(`CY', `%rcx')dnl
define(`CYl', `%ecx')dnl
define(`CYb', `%cl')dnl
define(`X', `%r14')dnl		# register that holds x value
define(`U', `%r11')dnl
define(`YP', `%r9')dnl		# register that points to the y array
define(`MP', `%r10')dnl		# register that points to the m array
define(`ZP', `%rdi')dnl		# register that holds z

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U
`#'                `YP' = YP, `MP' = MP

# mulredc1_5: stores in z[0..4] the five words of
#     (x * y[0..4] + u * m[0..4]) / 2^64,   u = (x * y[0] * inv_m) mod 2^64,
# and returns in %rax the carry out of the top word (0 or 1).
# The low word of x*y[0] + u*m[0] is expected to be 0, which is what makes
# the division by 2^64 exact; this is verified only in WANT_ASSERT builds.
# The passes j = 0..4 are fully unrolled. After each pass the m4 definitions
# of T0 and T1 are exchanged, so the two registers alternate roles without
# any register-to-register move.
# Saved and restored: %rbx, %r14 (plus %rsi, %rdi under the Windows ABI).
GSYM_PREFIX`'mulredc1_5:


#########################################################################
# i = 0 pass
#########################################################################

`#' register values at loop entry: YP = y, MP = m

# We need to compute u

	movq	(Y_PARAM), %rax		# rax = y[0] (time critical, do first)
	pushq	%rbx
	pushq	%r14
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi
	pushq	%rdi
	movq	%r9, MP			# store m in MP
	movq	Y_PARAM, YP
	movq	%rcx, ZP
	movq	%rdx, X'
,
`	movq	Y_PARAM, YP
	movq	%rcx, MP
	movq	%rsi, X		# store x in X
	# ZP is same as passed in'
)
	# All incoming argument registers that double as work registers
	# (rsi as T0, rcx as CY, r9 as YP) have been copied out by this point.

	xorl	CYl, CYl		# set %CY to 0 (a 32-bit xor clears all 64 bits)

	mulq	X			# rdx:rax = y[0] * x

	movq 	%rax, T0		# Move low word of product to T0
	movq	%rdx, T1		# Move high word of product to T1

	imulq	INVM_PARAM, %rax	# %rax = (low(x*y[0]) * invm) % 2^64
	movq	%rax, U			# this is the new u value

	mulq	(MP)			# multiply u*m[0]
	addq	%rax, T0		# Now %T0 = 0, need not be stored
	movq	8(YP), %rax		# Fetch y[1] (movq leaves the carry flag intact)
	adcq	%rdx, T1		# T1 += high word of u*m[0] + carry
	setc	CYb			# CY = carry out of T1 (0 or 1)
	# CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

ifdef(`WANT_ASSERT', `
	pushf
	testq	T0, T0
	jz	assert1
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
assert1:
	popf
')
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 1
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = running word that becomes z[j-1] at the end of this pass, T1 undefined
# CY = carry into T1 (0 or 1, produced by setc in the previous pass)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	8(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 0(ZP)	# Store the finished word in z[1-1]
	movq	16(YP), %rax	# Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 2
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = running word that becomes z[j-1] at the end of this pass, T1 undefined
# CY = carry into T1 (0 or 1, produced by setc in the previous pass)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	16(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 8(ZP)	# Store the finished word in z[2-1]
	movq	24(YP), %rax	# Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 3
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = running word that becomes z[j-1] at the end of this pass, T1 undefined
# CY = carry into T1 (0 or 1, produced by setc in the previous pass)
# We have CY:T1 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# y[j] * x
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	24(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	1f
	lea     _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
1:
',`')

	mulq	U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# Add T0 and low word
	movq	%rax, 16(ZP)	# Store the finished word in z[3-1]
	movq	32(YP), %rax	# Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 4. Don't fetch new data from y[j+1].
# This last pass writes two words, z[j-1] and z[j], and leaves the final
# carry in CY for the return value.

	movq	CY, T1		# T1 = CY <= 1

	mulq	X		# y[j] * x
	addq	%rax, T0	# Add low word to T0
	movq	32(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1 	# Add high word with carry to T1
	mulq    U		# m[j]*u
	addq	%rax, T0	# Add low word to T0
	movq	T0, 24(ZP)	# Store T0 in z[j-1] (movq keeps the carry flag)
	adcq	%rdx, T1	# Add high word with carry to T1
	movq	T1, 32(ZP)	# Store T1 in z[j]
	setc	CYb		# %CY <= 1

	movq	CY, %rax	# use carry as return value
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%rbx
	ret
ecm-6.4.4/x86_64/mulredc15.asm0000644023561000001540000011313312113421640012450 00000000000000# mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 28(%rsp)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored

# This stuff is run through M4 twice, first when generating the
# mulredc*.asm files from the mulredc.m4 file (when preparing the distro)
# and again when generating the mulredc*.s files from the mulredc*.asm files
# when the user compiles the program.
# We used to substitute XP etc. by register names in the first pass,
# but now with switching between Linux and Windows ABI, we do it in
# the second pass instead when we know which ABI we have, as that
# allows us to assign registers differently for the two ABIs.
# That means that the defines for XP etc., need to be quoted once to be
# protected in the first M4 pass, so that they are processed and
# occurrences of XP etc. happen only in the second pass.

include(`config.m4')

	TEXT
.align 64 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc15
	TYPE(GSYM_PREFIX`'mulredc`'15,`function')

# Implements multiplication and REDC for two input numbers of LENGTH words
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')
# tmp[0 ...
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc15: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $128, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 72(%TP) `#' Store T0 in tmp[10-1] movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 88(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 80(%TP) `#' Store T0 in tmp[11-1] movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 96(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 88(%TP) `#' Store T0 in tmp[12-1] movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 104(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 96(%TP) `#' Store T0 in tmp[13-1] movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 104(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 112(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 120(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 88(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 80(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 88(%YP), %rax `#' Fetch y[j+1] = y[11] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 72(%TP) `#' Store T0 in tmp[10-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 11 `#' Register values at 
entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 96(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 88(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 96(%YP), %rax `#' Fetch y[j+1] = y[12] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 80(%TP) `#' Store T0 in tmp[11-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 12 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 104(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 96(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 104(%YP), %rax `#' Fetch y[j+1] = y[13] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 88(%TP) `#' Store T0 in tmp[12-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 13 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 112(%TP), %T1 adcq %CY, %T1 # T1 = CY + 
tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 104(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 112(%YP), %rax `#' Fetch y[j+1] = y[14] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 96(%TP) `#' Store T0 in tmp[13-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 14. Don't fetch new data from y[j+1]. movq 120(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 112(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 104(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 112(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 120(%TP) # Store CY in tmp[j+1] cmpq $15, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movq 80(%TP), %rax movq 88(%TP), %rdx movq %rax, 80(%ZP) movq %rdx, 88(%ZP) movq 96(%TP), %rax movq 104(%TP), %rdx movq %rax, 96(%ZP) movq %rdx, 104(%ZP) movq 112(%TP), %rax movq %rax, 112(%ZP) movl %CYl, %eax # use carry as return value addq $128, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret 
ecm-6.4.4/x86_64/mulredc1_6.asm0000644023561000001540000002040512113421641012610 00000000000000# mp_limb_t mulredc1_6(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_6 TYPE(GSYM_PREFIX`'mulredc1_`'6,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_6: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = 
x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store T0 in z[1-1] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store T0 in z[2-1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store T0 in z[3-1] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 2) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store T0 in z[4-1] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 32(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 40(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc1_7.asm0000644023561000001540000002266412113421641012622 00000000000000# mp_limb_t mulredc1_7(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Computes u = (x * y[0] * inv_m) mod 2^64, stores the upper 7 words of # x * y[0..6] + u * m[0..6] in z[0..6] (the lowest word of that sum is # expected to be zero and is not stored), and returns the carry out of the # top word (0 or 1). # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) at entry, # i.e. 72(%rsp) after the four pushes done below # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_7 TYPE(GSYM_PREFIX`'mulredc1_`'7,`function') # Implements multiplication and REDC for one input number of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_7: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = (low word of x*y[0] * invm) % 2^64 movq %rax, U # this is the new u value mulq (MP) # multiply u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') # Swap the m4 names T0 and T1 (a rename only, no instruction is emitted), so # that T0 now refers to the register holding the high word computed above define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1 # Register values at entry: # %rax = y[j], X = 
x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 0(ZP) # Store low word of the sum in z[0] movq 16(YP), %rax # Fetch y[j+1] = y[2] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 2 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 16(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 8(ZP) # Store low word of the sum in z[1] movq 24(YP), %rax # Fetch y[j+1] = y[3] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 3 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 24(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 16(ZP) # Store low word of the sum in z[2] movq 32(YP), %rax # Fetch y[j+1] = y[4] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 4 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 32(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 24(ZP) # Store low word of the sum in z[3] movq 40(YP), %rax # Fetch y[j+1] = y[5] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 5 # Register values at entry: # %rax = y[j], X = x, U = u # T0 = value to store in tmp[j], T1 undefined # CY = carry into T1 (is <= 1) # We have CY:T1 <= 2 * 2^64 - 2 movq CY, T1 # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq X # y[j] * x # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, T0 # Add low word to T0 movq 40(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 1f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt 1: ',`') mulq U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq T0, %rax # Add T0 and low word movq %rax, 32(ZP) # Store low word of the sum in z[4] movq 48(YP), %rax # Fetch y[j+1] = y[6] into %rax adcq %rdx, T1 # Add high word with carry to T1 setc CYb # CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 6. Don't fetch new data from y[j+1]. 
movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x addq %rax, T0 # Add low word to T0 movq 48(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 40(ZP) # Store T0 in z[j-1] = z[5] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 48(ZP) # Store T1 in z[j] = z[6], the top result word setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/mulredc10.asm0000644023561000001540000006431612113421640012453 00000000000000# mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc10 TYPE(GSYM_PREFIX`'mulredc`'10,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... 
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc10: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $88, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' 
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 64(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 72(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 80(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9. Don't fetch new data from y[j+1]. movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 64(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 72(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 80(%TP) # Store CY in tmp[j+1] cmpq $10, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movq 64(%TP), %rax movq 72(%TP), %rdx movq %rax, 64(%ZP) movq %rdx, 72(%ZP) movl %CYl, %eax # use carry as return value addq $88, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc1_2.asm0000644023561000001540000000711112113421641012603 00000000000000# mp_limb_t mulredc1_2(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored include(`config.m4') ifdef(`WINDOWS64_ABI', `define(`Y_PARAM', `%r8')dnl define(`INVM_PARAM',`72(%rsp)')dnl' , `define(`Y_PARAM', `%rdx')dnl define(`INVM_PARAM',`%r8')dnl' )dnl TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc1_2 
TYPE(GSYM_PREFIX`'mulredc1_`'2,`function') # Implements multiplication and REDC for one input numbers of LENGTH words # and a multiplier of one word ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # YP = y, MP = m, # X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `%rsi')dnl define(`T1', `%rbx')dnl define(`CY', `%rcx')dnl define(`CYl', `%ecx')dnl define(`CYb', `%cl')dnl define(`X', `%r14')dnl # register that holds x value define(`U', `%r11')dnl define(`YP', `%r9')dnl # register that points to the y array define(`MP', `%r10')dnl # register that points to the m array define(`ZP', `%rdi')dnl # register that holds z `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U `#' `YP' = YP, `MP' = MP GSYM_PREFIX`'mulredc1_2: ######################################################################### # i = 0 pass ######################################################################### `#' register values at loop entry: YP = y, MP = m # We need to compute u movq (Y_PARAM), %rax # rax = y[0] (time critical, do first) pushq %rbx pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi movq %r9, MP # store m in MP movq Y_PARAM, YP movq %rcx, ZP movq %rdx, X' , ` movq Y_PARAM, YP movq %rcx, MP movq %rsi, X # store x in X # ZP is same as passed in' ) xorl CYl, CYl # set %CY to 0 mulq X # rdx:rax = y[0] * x movq %rax, T0 # Move low word of product to T0 movq %rdx, T1 # Move high word of product to T1 imulq INVM_PARAM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, U # this is the new u value mulq (MP) # multipy u*m[0] addq %rax, T0 # Now %T0 = 0, need not be stored movq 8(YP), %rax # Fetch y[1] adcq %rdx, T1 # setc CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 
<= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq T0, T0 jz assert1 lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt assert1: popf ') define(`TT', defn(`T0'))dnl define(`T0', defn(`T1'))dnl define(`T1', defn(`TT'))dnl undefine(`TT')dnl `#' Now `T0' = T0, `T1' = T1 # Pass for j = 1. Don't fetch new data from y[j+1]. movq CY, T1 # T1 = CY <= 1 mulq X # y[j] * x[i] addq %rax, T0 # Add low word to T0 movq 8(MP), %rax # Fetch m[j] into %rax adcq %rdx, T1 # Add high word with carry to T1 mulq U # m[j]*u addq %rax, T0 # Add low word to T0 movq T0, 0(ZP) # Store T0 in z[j-1] adcq %rdx, T1 # Add high word with carry to T1 movq T1, 8(ZP) # Store T1 in tmp[j] setc CYb # %CY <= 1 movq CY, %rax # use carry as return value ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %rbx ret ecm-6.4.4/x86_64/Makefile.am0000644023561000001540000000745512106741272012222 00000000000000MULREDC = mulredc1.asm mulredc2.asm mulredc3.asm mulredc4.asm mulredc5.asm mulredc6.asm \ mulredc7.asm mulredc8.asm mulredc9.asm mulredc10.asm mulredc11.asm \ mulredc12.asm mulredc13.asm mulredc14.asm mulredc15.asm mulredc16.asm \ mulredc17.asm mulredc18.asm mulredc19.asm mulredc20.asm MULREDC1 = mulredc1_2.asm mulredc1_3.asm mulredc1_4.asm mulredc1_5.asm \ mulredc1_6.asm mulredc1_7.asm mulredc1_8.asm mulredc1_9.asm mulredc1_10.asm \ mulredc1_11.asm mulredc1_12.asm mulredc1_13.asm mulredc1_14.asm \ mulredc1_15.asm mulredc1_16.asm mulredc1_17.asm mulredc1_18.asm \ mulredc1_19.asm mulredc1_20.asm EXTRA_DIST = autogen.py generate_all mulredc.m4 mulredc1.m4 noinst_LTLIBRARIES = libmulredc.la noinst_HEADERS = mulredc.h # This library definition also causes the mulredc[n].asm, mulredc1_[n].asm, # and redc.asm files to go in the distribution - no need for having # them in EXTRA_DIST # redc.asm is removed, is slower than GMP and not ported to Win64 ABI libmulredc_la_SOURCES = $(MULREDC) $(MULREDC1) # It's actually the .s 
files that depend on config.m4, but automake # knows them only as intermediate files, not as targets. Adding the # dependency to libmulredc.la should work so long as no stale .s # files exist. libmulredc_la_DEPENDENCIES = $(top_builddir)/config.m4 # The asm code does not depend on any libraries except libc for abort() # if assertions are enabled LIBS = LDFLAGS = # There has to be a way of making this automatically mulredc2.asm: mulredc.m4 $(M4) -DLENGTH=2 $< > $@ mulredc3.asm: mulredc.m4 $(M4) -DLENGTH=3 $< > $@ mulredc4.asm: mulredc.m4 $(M4) -DLENGTH=4 $< > $@ mulredc5.asm: mulredc.m4 $(M4) -DLENGTH=5 $< > $@ mulredc6.asm: mulredc.m4 $(M4) -DLENGTH=6 $< > $@ mulredc7.asm: mulredc.m4 $(M4) -DLENGTH=7 $< > $@ mulredc8.asm: mulredc.m4 $(M4) -DLENGTH=8 $< > $@ mulredc9.asm: mulredc.m4 $(M4) -DLENGTH=9 $< > $@ mulredc10.asm: mulredc.m4 $(M4) -DLENGTH=10 $< > $@ mulredc11.asm: mulredc.m4 $(M4) -DLENGTH=11 $< > $@ mulredc12.asm: mulredc.m4 $(M4) -DLENGTH=12 $< > $@ mulredc13.asm: mulredc.m4 $(M4) -DLENGTH=13 $< > $@ mulredc14.asm: mulredc.m4 $(M4) -DLENGTH=14 $< > $@ mulredc15.asm: mulredc.m4 $(M4) -DLENGTH=15 $< > $@ mulredc16.asm: mulredc.m4 $(M4) -DLENGTH=16 $< > $@ mulredc17.asm: mulredc.m4 $(M4) -DLENGTH=17 $< > $@ mulredc18.asm: mulredc.m4 $(M4) -DLENGTH=18 $< > $@ mulredc19.asm: mulredc.m4 $(M4) -DLENGTH=19 $< > $@ mulredc20.asm: mulredc.m4 $(M4) -DLENGTH=20 $< > $@ mulredc1_2.asm: mulredc1.m4 $(M4) -DLENGTH=2 $< > $@ mulredc1_3.asm: mulredc1.m4 $(M4) -DLENGTH=3 $< > $@ mulredc1_4.asm: mulredc1.m4 $(M4) -DLENGTH=4 $< > $@ mulredc1_5.asm: mulredc1.m4 $(M4) -DLENGTH=5 $< > $@ mulredc1_6.asm: mulredc1.m4 $(M4) -DLENGTH=6 $< > $@ mulredc1_7.asm: mulredc1.m4 $(M4) -DLENGTH=7 $< > $@ mulredc1_8.asm: mulredc1.m4 $(M4) -DLENGTH=8 $< > $@ mulredc1_9.asm: mulredc1.m4 $(M4) -DLENGTH=9 $< > $@ mulredc1_10.asm: mulredc1.m4 $(M4) -DLENGTH=10 $< > $@ mulredc1_11.asm: mulredc1.m4 $(M4) -DLENGTH=11 $< > $@ mulredc1_12.asm: mulredc1.m4 $(M4) -DLENGTH=12 $< > $@ mulredc1_13.asm: 
mulredc1.m4 $(M4) -DLENGTH=13 $< > $@ mulredc1_14.asm: mulredc1.m4 $(M4) -DLENGTH=14 $< > $@ mulredc1_15.asm: mulredc1.m4 $(M4) -DLENGTH=15 $< > $@ mulredc1_16.asm: mulredc1.m4 $(M4) -DLENGTH=16 $< > $@ mulredc1_17.asm: mulredc1.m4 $(M4) -DLENGTH=17 $< > $@ mulredc1_18.asm: mulredc1.m4 $(M4) -DLENGTH=18 $< > $@ mulredc1_19.asm: mulredc1.m4 $(M4) -DLENGTH=19 $< > $@ mulredc1_20.asm: mulredc1.m4 $(M4) -DLENGTH=20 $< > $@ .asm.s: $(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.s # Nothing here needs the C preprocessor, and including this rule causes # "make" to build .S, then .s files which fails on case-insensitive # filesystems #.asm.S: # $(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.S ecm-6.4.4/x86_64/mulredc4.asm0000644023561000001540000003113512113421640012367 00000000000000# mp_limb_t mulredc4(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. 
include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc4 TYPE(GSYM_PREFIX`'mulredc`'4,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. 
Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc4: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $40, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 16(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 24(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 32(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3. Don't fetch new data from y[j+1]. movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 16(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 24(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 32(%TP) # Store CY in tmp[j+1] cmpq $4, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movl %CYl, %eax # use carry as return value addq $40, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/mulredc8.asm0000644023561000001540000005324512113421640012401 00000000000000# mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: 
%rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 28(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc8 TYPE(GSYM_PREFIX`'mulredc`'8,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ... len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. 
These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x arraz define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ... 
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc8: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $72, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = 
T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 48(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 56(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 64(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[0] and tmp[1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7. Don't fetch new data from y[j+1]. movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 48(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 56(%TP) # Store T1 in tmp[j] adcb $0, %CYb # %CY <= 2 movq %CY, 64(%TP) # Store CY in tmp[j+1] cmpq $8, %I jb 1b # Copy result from tmp memory to z movq (%TP), %rax movq 8(%TP), %rdx movq %rax, (%ZP) movq %rdx, 8(%ZP) movq 16(%TP), %rax movq 24(%TP), %rdx movq %rax, 16(%ZP) movq %rdx, 24(%ZP) movq 32(%TP), %rax movq 40(%TP), %rdx movq %rax, 32(%ZP) movq %rdx, 40(%ZP) movq 48(%TP), %rax movq 56(%TP), %rdx movq %rax, 48(%ZP) movq %rdx, 56(%ZP) movl %CYl, %eax # use carry as return value addq $72, %rsp ifdef(`WINDOWS64_ABI', ` popq %rdi popq %rsi ') dnl popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx ret ecm-6.4.4/x86_64/README0000644023561000001540000000146512106741272011041 00000000000000mulredc[1..20].asm are size-specific asm functions for mulredc. These are generated by the Python autogen.py script (old version, still used for sizes 1 and 2) and the m4 script mulredc.m4 (all other sizes). In order to avoid dependency on the Python and m4 packages, this generation is not done automatically with the autoconf/automake stuff. 
If you need to regenerate them, the syntax is ./autogen.py 1 > mulredc1.asm ./autogen.py 2 > mulredc2.asm m4 -DLENGTH=3 mulredc.m4 > mulredc3.asm m4 -DLENGTH=4 mulredc.m4 > mulredc4.asm etc., up to LENGTH=20. If you have problems, you should reconfigure with the --disable-asm-redc option. redc.asm is a version of redc separated from the multiplication, since there are cases where it is needed. test_mulredc.c, bench.c and the Makefile.dev are for development.ecm-6.4.4/x86_64/mulredc11.asm0000644023561000001540000007074412113421640012456 00000000000000# mp_limb_t mulredc11(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, # const mp_limb_t *m, mp_limb_t inv_m); # # Linux: z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8 # Needs %rbx, %rsp, %rbp, %r12-%r15 restored # Windows: z: %rcx, x: %rdx, y: %r8, m: %r9, inv_m: 40(%rsp) # Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored # This stuff is run through M4 twice, first when generating the # mulredc*.asm files from the mulredc.m4 file (when preparing the distro) # and again when generating the mulredc*.s files from the mulredc*.asm files # when the user compiles the program. # We used to substitute XP etc. by register names in the first pass, # but now with switching between Linux and Windows ABI, we do it in # the second pass instead when we know which ABI we have, as that # allows us to assign registers differently for the two ABIs. # That means that the defines for XP etc., need to be quoted once to be # protected in the first M4 pass, so that they are processed and # occurrences of XP etc. happen only in the second pass. include(`config.m4') TEXT .align 64 # Opteron L1 code cache line is 64 bytes long GLOBL GSYM_PREFIX`'mulredc11 TYPE(GSYM_PREFIX`'mulredc`'11,`function') # Implements multiplication and REDC for two input numbers of LENGTH words ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI') # tmp[0 ...
len+1] = 0 # for (i = 0; i < len; i++) # { # t = x[i] * y[0]; /* Keep and reuse this product */ # u = ((t + tmp[0]) * invm) % 2^64 # tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */ # for (j = 1; j < len; j++) # { # tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD); # /* put new carry in cy */ # } # tmp[len] = cy; # } # z[0 ... len-1] = tmp[0 ... len-1] # return (tmp[len]) # Values that are referenced only once in the loop over j go into r8 .. r14, # In the inner loop (over j), tmp, x[i], y, m, and u are constant. # tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values # stay in registers and are referenced as # TP = tmp, YP = y, MP = m, # XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry define(`T0', `rsi')dnl define(`T0l', `esi')dnl define(`T1', `rbx')dnl define(`T1l', `ebx')dnl define(`CY', `rcx')dnl define(`CYl', `ecx')dnl define(`CYb', `cl')dnl define(`XI', `r14')dnl # register that holds x[i] value define(`U', `r11')dnl define(`XP', `r13')dnl # register that points to the x array define(`TP', `rbp')dnl # register that points to t + i define(`I', `r12')dnl # register that holds loop counter i define(`Il', `r12d')dnl # register that holds loop counter i define(`ZP', `rdi')dnl # register that holds z. Same as passed in ifdef(`WINDOWS64_ABI', `define(`YP', `r8')dnl # points to y array, same as passed in define(`MP', `r9')dnl # points to m array, same as passed in define(`INVM', `r10')dnl # register that holds invm. Same as passed in' , `define(`YP', `r9')dnl # register that points to the y array define(`MP', `r10')dnl # register that points to the m array define(`INVM', `r8')dnl # register that holds invm. Same as passed in' )dnl `#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U `#' `YP' = YP, `MP' = MP, `TP' = TP # local variables: tmp[0 ...
LENGTH] array, having LENGTH+1 8-byte words # The tmp array needs LENGTH+1 entries, the last one is so that we can # store CY at tmp[j+1] for j == len-1 GSYM_PREFIX`'mulredc11: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 ifdef(`WINDOWS64_ABI', ` pushq %rsi pushq %rdi ') dnl ifdef(`WINDOWS64_ABI', ` movq %rdx, %XP movq %rcx, %ZP movq 96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes' , ` movq %rsi, %XP # store x in XP movq %rdx, %YP # store y in YP movq %rcx, %MP # store m in MP' ) dnl subq $96, %rsp # subtract size of local vars ######################################################################### # i = 0 pass ######################################################################### # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be != 0) # Pass for j = 0. We need to fetch x[i] from memory and compute the new u movq (%XP), %XI # XI = x[0] movq (%YP), %rax # rax = y[0] xorl %CYl, %CYl # set %CY to 0 lea (%rsp), %TP # store addr of tmp array in TP movl %CYl, %Il # Set %I to 0 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I movq %rax, %T0 # Move low word of product to T0 movq %rdx, %T1 # Move high word of product to T1 ifdef(`MULREDC_SVOBODA', , `' ` imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64' ) movq %rax, %U # this is the new u value mulq (%MP) # multiply u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored movq 8(%YP), %rax # Fetch y[1] adcq %rdx, %T1 # setc %CYb # CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence # CY:T1 <= 2*2^64 - 4 ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 2f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(2) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1'
= T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 8(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 0(%TP) `#' Store T0 in tmp[1-1] movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 16(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 8(%TP) `#' Store T0 in tmp[2-1] movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 24(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 16(%TP) `#' Store T0 in tmp[3-1] movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 32(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 24(%TP) `#' Store T0 in tmp[4-1] movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 40(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 32(%TP) `#' Store T0 in tmp[5-1] movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 48(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 40(%TP) `#' Store T0 in tmp[6-1] movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 56(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 48(%TP) `#' Store T0 in tmp[7-1] movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 64(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 56(%TP) `#' Store T0 in tmp[8-1] movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined `#' %CY = carry into T1 (is <= 2) # We have %CY:%T1 <= 2 * 2^64 - 2 movl %CYl, %T1l # T1 = CY <= 1 # Here, T1:T0 <= 2*2^64 - 2 mulq %XI # y[j] * x[i] # rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1 addq %rax, %T0 # Add low word to T0 movq 72(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 # T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry! 
ifdef(`WANT_ASSERT', ` jnc 3f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(3)') mulq %U # m[j]*u # rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1 addq %T0, %rax # Add T0 and low word movq %rax, 64(%TP) `#' Store T0 in tmp[9-1] movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 # CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <= # 2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2 define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10. Don't fetch new data from y[j+1]. movl %CYl, %T1l # T1 = CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq 80(%MP), %rax # Fetch m[j] into %rax adcq %rdx, %T1 # Add high word with carry to T1 mulq %U # m[j]*u addq %rax, %T0 # Add low word to T0 movq %T0, 72(%TP) # Store T0 in tmp[j-1] adcq %rdx, %T1 # Add high word with carry to T1 movq %T1, 80(%TP) # Store T1 in tmp[j] setc %CYb # %CY <= 1 movq %CY, 88(%TP) # Store CY in tmp[j+1] ######################################################################### # i > 0 passes ######################################################################### .align 32,,16 LABEL_SUFFIX(1) # register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m # %CY < 255 (i.e. only low byte may be > 0) # Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory # and compute the new u movq (%XP,%I,8), %XI # XI = x[i] movq (%YP), %rax # rax = y[0] #init the register tmp ring buffer movq (%TP), %T0 # Load tmp[0] into T0 movq 8(%TP), %T1 # Load tmp[1] into T1 mulq %XI # rdx:rax = y[0] * x[i] addq $1, %I addq %T0, %rax # Add T0 to low word adcq %rdx, %T1 # Add high word with carry to T1 setc %CYb # %CY <= 1 movq %rax, %T0 # Save sum of low words in T0 imulq %INVM, %rax # %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64 movq %rax, %U # this is the new u value mulq (%MP) # multipy u*m[0] addq %rax, %T0 # Now %T0 = 0, need not be stored adcq %rdx, %T1 # movq 8(%YP), %rax # Fetch y[1] ifdef(`WANT_ASSERT', ` pushf testq %T0, %T0 jz 4f lea _GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we # need to set rbx; if not, it doesnt hurt call GSYM_PREFIX`'abort@plt LABEL_SUFFIX(4) popf') define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 1 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 16(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 8(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 16(%YP), %rax `#' Fetch y[j+1] = y[2] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 0(%TP) `#' Store T0 in tmp[1-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 
`#' Pass for j = 2 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 24(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 16(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 24(%YP), %rax `#' Fetch y[j+1] = y[3] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 8(%TP) `#' Store T0 in tmp[2-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 3 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 32(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 24(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 32(%YP), %rax `#' Fetch y[j+1] = y[4] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 16(%TP) `#' Store T0 in tmp[3-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 4 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 40(%TP), 
%T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 32(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 40(%YP), %rax `#' Fetch y[j+1] = y[5] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 24(%TP) `#' Store T0 in tmp[4-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 5 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 48(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 40(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 48(%YP), %rax `#' Fetch y[j+1] = y[6] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 32(%TP) `#' Store T0 in tmp[5-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 6 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 56(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 48(%MP) # m[j]*u addq 
%rax, %T0 # Add T0 and low word movq 56(%YP), %rax `#' Fetch y[j+1] = y[7] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 40(%TP) `#' Store T0 in tmp[6-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 7 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 64(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 56(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 64(%YP), %rax `#' Fetch y[j+1] = y[8] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 48(%TP) `#' Store T0 in tmp[7-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 8 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 72(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 64(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 72(%YP), %rax `#' Fetch y[j+1] = y[9] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 56(%TP) `#' Store T0 in tmp[8-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl 
define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 9 `#' Register values at entry: `#' %rax = y[j], %XI = x[i], %U = u `#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in `#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1 movq 80(%TP), %T1 adcq %CY, %T1 # T1 = CY + tmp[j+1] setc %CYb # %CY <= 1 mulq %XI # y[j] * x[i] addq %rax, %T0 # Add low word to T0 movq %U, %rax adcq %rdx, %T1 # Add high word with carry to T1 adcb $0, %CYb # %CY <= 2 mulq 72(%MP) # m[j]*u addq %rax, %T0 # Add T0 and low word movq 80(%YP), %rax `#' Fetch y[j+1] = y[10] into %rax adcq %rdx, %T1 # Add high word with carry to T1 movq %T0, 64(%TP) `#' Store T0 in tmp[9-1] define(`TT', defn(`T0'))dnl define(`TTl', defn(`T0l'))dnl define(`T0', defn(`T1'))dnl define(`T0l', defn(`T1l'))dnl define(`T1', defn(`TT'))dnl define(`T1l', defn(`TTl'))dnl undefine(`TT')dnl undefine(`TTl')dnl `#' Now `T0' = T0, `T1' = T1 `#' Pass for j = 10. Don't fetch new data from y[j+1]. 
# mulredc7: combined multiplication and REDC for 7-word operands, following
# the tmp/u/cy pseudo-code given further below.  The 7 low words of tmp are
# copied to z and the final carry word is returned in %eax.
# NOTE: per the M4 remarks below, this file is generated from mulredc.m4,
# so lasting changes presumably belong in that template - confirm before
# editing this file by hand.
#
# mp_limb_t mulredc7(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 28(%rsp)
#          (presumably hexadecimal 0x28, i.e. 40 bytes above %rsp at entry;
#          this matches the 96(%rsp) load done after the 7 pushes below)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored

# This stuff is run through M4 twice, first when generating the
# mulredc*.asm files from the mulredc.m4 file (when preparing the distro)
# and again when generating the mulredc*.s files from the mulredc*.asm files
# when the user compiles the program.
# We used to substitute XP etc. by register names in the first pass,
# but now with switching between Linux and Windows ABI, we do it in
# the second pass instead when we know which ABI we have, as that
# allows us to assign registers differently for the two ABIs.
# That means that the defines for XP etc., need to be quoted once to be
# protected in the first M4 pass, so that they are processed and
# occurrences of XP etc. happen only in the second pass.

include(`config.m4')

	TEXT
.align 64 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc7
	TYPE(GSYM_PREFIX`'mulredc`'7,`function')

# Implements multiplication and REDC for two input numbers of LENGTH words
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')
# tmp[0 ... len+1] = 0
# for (i = 0; i < len; i++)
#   {
#     t = x[i] * y[0]; /* Keep and reuse this product */
#     u = ((t + tmp[0]) * invm) % 2^64
#     tmp[0] += (t + m[0]*u) / 2^64; /* put carry in cy. */
#     for (j = 1; j < len; j++)
#       {
#         tmp[j-1 ... j] += x[i]*y[j] + m[j]*u + (cy << BITS_PER_WORD);
#         /* put new carry in cy */
#       }
#     tmp[len] = cy;
#   }
# z[0 ... len-1] = tmp[0 ... len-1]
# return (tmp[len])


# Values that are referenced only once in the loop over j go into r8 .. r14,
# In the inner loop (over j), tmp, x[i], y, m, and u are constant.
# tmp[j], tmp[j+1], tmp[j+2] are updated frequently. These 8 values
# stay in registers and are referenced as
# TP = tmp, YP = y, MP = m,
# XI = x[i], T0 = tmp[j], T1 = tmp[j+1], CY = carry

# T0 and T1 (with their 32-bit forms T0l and T1l) are not fixed: the define
# blocks between the unrolled passes exchange the two names, so that the
# register which held tmp[j+1] is called T0 in the next pass.
define(`T0', `rsi')dnl
define(`T0l', `esi')dnl
define(`T1', `rbx')dnl
define(`T1l', `ebx')dnl
define(`CY', `rcx')dnl
define(`CYl', `ecx')dnl
define(`CYb', `cl')dnl
define(`XI', `r14')dnl		# register that holds x[i] value
define(`U', `r11')dnl
define(`XP', `r13')dnl		# register that points to the x array
define(`TP', `rbp')dnl		# register that points to t + i
define(`I', `r12')dnl		# register that holds loop counter i
define(`Il', `r12d')dnl		# register that holds loop counter i
define(`ZP', `rdi')dnl		# register that holds z. Same as passed in
ifdef(`WINDOWS64_ABI',
`define(`YP', `r8')dnl		# points to y array, same as passed in
define(`MP', `r9')dnl		# points to m array, same as passed in
define(`INVM', `r10')dnl	# register that holds invm. Same as passed in'
,
`define(`YP', `r9')dnl		# register that points to the y array
define(`MP', `r10')dnl		# register that points to the m array
define(`INVM', `r8')dnl		# register that holds invm. Same as passed in'
)dnl

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `XI' = XI, `U' = U
`#'                `YP' = YP, `MP' = MP, `TP' = TP

# local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words
# The tmp array needs LENGTH+1 entries, the last one is so that we can
# store CY at tmp[j+1] for j == len-1


GSYM_PREFIX`'mulredc7:
# Save the registers that are used as variables below and that the comment
# at the top of the file lists as needing to be restored.
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi
	pushq	%rdi
') dnl
# Move the incoming arguments into the registers named by the defines above.
ifdef(`WINDOWS64_ABI',
`	movq	%rdx, %XP
	movq	%rcx, %ZP
	movq	96(%rsp), %INVM # 7 push, ret addr, 4 reg vars = 96 bytes'
,
`	movq	%rsi, %XP		# store x in XP
	movq	%rdx, %YP		# store y in YP
	movq	%rcx, %MP		# store m in MP'
) dnl
	subq	$64, %rsp	# subtract size of local vars


#########################################################################
# i = 0 pass
#########################################################################

# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m
# %CY < 255 (i.e. only low byte may be != 0)

# Pass for j = 0. We need to fetch x[i] from memory and compute the new u

	movq	(%XP), %XI		# XI = x[0]
	movq	(%YP), %rax		# rax = y[0]

	xorl	%CYl, %CYl		# set %CY to 0
	lea	(%rsp), %TP		# store addr of tmp array in TP
	movl	%CYl, %Il		# Set %I to 0

	mulq	%XI			# rdx:rax = y[0] * x[i]
	addq	$1, %I

	movq	%rax, %T0		# Move low word of product to T0
	movq	%rdx, %T1		# Move high word of product to T1

# The multiplication by invm is left out when MULREDC_SVOBODA is defined;
# u is then simply the low word of the product.
ifdef(`MULREDC_SVOBODA',
,
`'
`	imulq	%INVM, %rax		# %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64'
)
	movq	%rax, %U		# this is the new u value

	mulq	(%MP)			# multiply u*m[0]
	addq	%rax, %T0		# Now %T0 = 0, need not be stored
	movq	8(%YP), %rax		# Fetch y[1]
	adcq	%rdx, %T1		#
	setc	%CYb
	# CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

# With WANT_ASSERT, abort unless the low word T0 became zero.  The flags are
# saved and restored around the test.
ifdef(`WANT_ASSERT', `
	pushf
	testq	%T0, %T0
	jz	2f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(2)
	popf')
# Exchange the names T0 and T1 (no data is moved): the register holding the
# high word becomes T0 for the next pass.
define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 1
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	8(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	3f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')

	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 0(%TP)	`#' Store T0 in tmp[1-1]
	movq	16(%YP), %rax	`#' Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 2
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	16(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	3f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')

	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 8(%TP)	`#' Store T0 in tmp[2-1]
	movq	24(%YP), %rax	`#' Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 3
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	24(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	3f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')

	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 16(%TP)	`#' Store T0 in tmp[3-1]
	movq	32(%YP), %rax	`#' Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 4
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	32(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	3f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')

	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 24(%TP)	`#' Store T0 in tmp[4-1]
	movq	40(%YP), %rax	`#' Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 5
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 undefined
`#' %CY = carry into T1 (is <= 2)
# We have %CY:%T1 <= 2 * 2^64 - 2

	movl	%CYl, %T1l	# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	%XI		# y[j] * x[i]
	# rdx:rax <= (2^64-1)^2 <= 2^128 - 2*2^64 + 1
	addq	%rax, %T0	# Add low word to T0
	movq	40(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!
ifdef(`WANT_ASSERT', `
	jnc	3f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(3)')

	mulq	%U		# m[j]*u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	%T0, %rax	# Add T0 and low word
	movq	%rax, 32(%TP)	`#' Store T0 in tmp[5-1]
	movq	48(%YP), %rax	`#' Fetch y[j+1] = y[6] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 6. Don't fetch new data from y[j+1].

	movl	%CYl, %T1l	# T1 = CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	48(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	mulq	%U		# m[j]*u
	addq	%rax, %T0	# Add low word to T0
	movq	%T0, 40(%TP)	# Store T0 in tmp[j-1]
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T1, 48(%TP)	# Store T1 in tmp[j]
	setc	%CYb		# %CY <= 1
	movq	%CY, 56(%TP)	# Store CY in tmp[j+1]

#########################################################################
# i > 0 passes
#########################################################################

.align 32,,16
LABEL_SUFFIX(1)

# register values at loop entry: %TP = tmp, %I = i, %YP = y, %MP = m
# %CY < 255 (i.e. only low byte may be > 0)

# Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
# and compute the new u

	movq	(%XP,%I,8), %XI		# XI = x[i]
	movq	(%YP), %rax		# rax = y[0]
#init the register tmp ring buffer
	movq	(%TP), %T0		# Load tmp[0] into T0
	movq	8(%TP), %T1		# Load tmp[1] into T1

	mulq	%XI			# rdx:rax = y[0] * x[i]
	addq	$1, %I

	addq	%T0, %rax		# Add T0 to low word
	adcq	%rdx, %T1		# Add high word with carry to T1
	setc	%CYb			# %CY <= 1

	movq	%rax, %T0		# Save sum of low words in T0
	imulq	%INVM, %rax		# %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64
	movq	%rax, %U		# this is the new u value

	mulq	(%MP)			# multiply u*m[0]
	addq	%rax, %T0		# Now %T0 = 0, need not be stored
	adcq	%rdx, %T1		#

	movq	8(%YP), %rax		# Fetch y[1]

# The carry flag left by the adcq above is consumed by the next pass (see its
# entry conditions), so the WANT_ASSERT check wraps its testq in pushf/popf.
ifdef(`WANT_ASSERT', `
	pushf
	testq	%T0, %T0
	jz	4f
	lea	_GLOBAL_OFFSET_TABLE_(%rip), %rbx # if we do PIC code, we
		# need to set rbx; if not, it doesnt hurt
	call	GSYM_PREFIX`'abort@plt
LABEL_SUFFIX(4)
	popf')
define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 1
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	16(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	8(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	16(%YP), %rax	`#' Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 0(%TP)	`#' Store T0 in tmp[1-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 2
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	24(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	16(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	24(%YP), %rax	`#' Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 8(%TP)	`#' Store T0 in tmp[2-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 3
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	32(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	24(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	32(%YP), %rax	`#' Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 16(%TP)	`#' Store T0 in tmp[3-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 4
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	40(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	32(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	40(%YP), %rax	`#' Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 24(%TP)	`#' Store T0 in tmp[4-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 5
`#' Register values at entry:
`#' %rax = y[j], %XI = x[i], %U = u
`#' %TP = tmp, %T0 = value to store in tmp[j], %T1 value to store in
`#' tmp[j+1], %CY = carry into T1, carry flag: also carry into T1

	movq	48(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]
	setc	%CYb		# %CY <= 1

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	%U, %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	adcb	$0, %CYb	# %CY <= 2

	mulq	40(%MP)		# m[j]*u
	addq	%rax, %T0	# Add T0 and low word
	movq	48(%YP), %rax	`#' Fetch y[j+1] = y[6] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T0, 32(%TP)	`#' Store T0 in tmp[5-1]

define(`TT', defn(`T0'))dnl
define(`TTl', defn(`T0l'))dnl
define(`T0', defn(`T1'))dnl
define(`T0l', defn(`T1l'))dnl
define(`T1', defn(`TT'))dnl
define(`T1l', defn(`TTl'))dnl
undefine(`TT')dnl
undefine(`TTl')dnl
`#' Now `T0' = T0, `T1' = T1


`#' Pass for j = 6. Don't fetch new data from y[j+1].

	movq	56(%TP), %T1
	adcq	%CY, %T1	# T1 = CY + tmp[j+1]

	mulq	%XI		# y[j] * x[i]
	addq	%rax, %T0	# Add low word to T0
	movq	48(%MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, %T1	# Add high word with carry to T1
	setc	%CYb		# %CY <= 1
	mulq	%U		# m[j]*u
	addq	%rax, %T0	# Add low word to T0
	movq	%T0, 40(%TP)	# Store T0 in tmp[j-1]
	adcq	%rdx, %T1	# Add high word with carry to T1
	movq	%T1, 48(%TP)	# Store T1 in tmp[j]
	adcb	$0, %CYb	# %CY <= 2
	movq	%CY, 56(%TP)	# Store CY in tmp[j+1]

# %I was already incremented in the j = 0 pass, so this repeats the loop
# body while the index of the next x word is below 7.
	cmpq	$7, %I
	jb	1b

# Copy result from tmp memory to z
	movq	(%TP), %rax
	movq	8(%TP), %rdx
	movq	%rax, (%ZP)
	movq	%rdx, 8(%ZP)
	movq	16(%TP), %rax
	movq	24(%TP), %rdx
	movq	%rax, 16(%ZP)
	movq	%rdx, 24(%ZP)
	movq	32(%TP), %rax
	movq	40(%TP), %rdx
	movq	%rax, 32(%ZP)
	movq	%rdx, 40(%ZP)
	movq	48(%TP), %rax
	movq	%rax, 48(%ZP)

	movl	%CYl, %eax	# use carry as return value
	addq	$64, %rsp
# Restore the saved registers in the reverse order of the pushes at entry.
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret
*/ #include "ecm-ecm.h" #include /* * Version #2 function is the one we are using with a const * adjustment of 1.33 */ /* Version #1 and Version #3 are not being used, but they have been kept in the source, so that they can be refered to if and when changes are made */ double calc_B1_AutoIncrement_v3 (double cur_B1, double incB1val, int calcInc); double calc_B1_AutoIncrement_v1 (double cur_B1, double incB1val, int calcInc); /* Here is my "first" attempt at a B1 adjustment function. * Parameters: * cur_B1 the current B1 level * incB1val This is ether a constant or an "adjustment factor" * calcInc Tells whether incB1val is a constant, or whether we * should compute the optimal B1 adjustment, and then * "adjust" that optimal value based up incB1val, which * is then treadted as a scaling factor. * * Returns: the new B1 value to use. * * Assumption The return value is based upon providing the recommended * optimal number of curves for a "range" of B1's, then * computing the amount of adjustment needed to push B1 to * the next level. NOTE this may be too slow of a push. * If it optimal curves at 250000 is 500, using 500 curves * in an ever advancing B1 level from 250000 to the next * level (1e6) is considerably more work than the simple * 500 curves at B1=250000. It might be prudent to make * the adjustment deal with the low bound value, the high * bound value, and how far the current B1 is from the low * and the high B1 boundary. */ double calc_B1_AutoIncrement_v1 (double cur_B1, double incB1val, int calcInc) { double B1Mod; if (!calcInc) return cur_B1 + incB1val; /* incB1val is a constant to add to B1 */ /* This simple table was "created" based upon the "Optimal B1 table" in the README file */ if (cur_B1 < 2000.) B1Mod = 200.; else if (cur_B1 < 11000.) /* 30 curves from B1=2000 to 11000 */ B1Mod = 300.; else if (cur_B1 < 50000.) /* 90 curves from B1=11000 to 50000 */ B1Mod = 433.3334; else if (cur_B1 < 250000.) 
/* 240 curves from B1=50000 to 250000 */ B1Mod = 833.3334; else if (cur_B1 < 1000000.) /* 500 curves from B1=250000 to 1e6 */ B1Mod = 1500.; else if (cur_B1 < 3000000.) /* 1100 curves from B1=1e6 to 3e6 */ B1Mod = 1818.18182; else if (cur_B1 < 11000000.) /* 2900 curves from B1=3e6 to 11e6 */ B1Mod = 2758.621; else if (cur_B1 < 43000000.) /* 5500 curves from B1=11e6 to 43e6 */ B1Mod = 5818.18182; else if (cur_B1 < 110000000.) /* 9000 curves from B1=43e6 to 11e7 */ B1Mod = 7444.44445; else if (cur_B1 < 260000000.) /* 22000 curves from B1=11e7 to 26e7 */ B1Mod = 6818.18182; else if (cur_B1 < 850000000.) /* 52000 curves from B1=26e7 to 85e7 */ B1Mod = 11346.1539; else if (cur_B1 < 2900000000.) /* 83000 curves from B1=85e7 to 29e8 */ B1Mod = 24698.8; else B1Mod = 35000.; return floor (cur_B1 + (B1Mod*incB1val) + 0.5); } /* Here is my "second" attempt at a B1 adjustment function. * this version looks pretty good * * THIS is the version being used. */ double calc_B1_AutoIncrement (double cur_B1, double incB1val, int calcInc) { const double const_adj = 1.33; double B1Mod; if (!calcInc) return cur_B1 + incB1val; /* incB1val is a constant to add to B1 */ /* This simple table was "created" based upon the "Optimal B1 table" in the README file */ if (cur_B1 < 2000.) B1Mod = 200.; else if (cur_B1 < 11000.) /* 30 curves from B1=2000 to 11000 */ { B1Mod = 300. * (1. - ((cur_B1 - 2000.) / 9000.)); B1Mod +=433.334 * (1. - ((11000. - cur_B1) / 9000.)); } else if (cur_B1 < 50000.) /* 90 curves from B1=11000 to 50000 */ { B1Mod = 433.334 * (1. - ((cur_B1 - 11000.) / 39000.)); B1Mod +=833.334 * (1. - ((50000. - cur_B1) / 39000.)); } else if (cur_B1 < 250000.) /* 240 curves from B1=50000 to 250000 */ { B1Mod = 833.334 * (1. - ((cur_B1 - 50000.) / 200000.)); B1Mod +=1500. * (1. - ((250000. - cur_B1) / 200000.)); } else if (cur_B1 < 1000000.) /* 500 curves from B1=250000 to 1e6 */ { B1Mod = 1500. * (1. - ((cur_B1 - 250000.) / 750000.)); B1Mod +=1818.18182 * (1. - ((1000000. 
- cur_B1) / 750000.)); } else if (cur_B1 < 3000000.) /* 1100 curves from B1=1e6 to 3e6 */ { B1Mod = 1818.18182 * (1. - ((cur_B1 - 1000000.) / 2000000.)); B1Mod +=2758.621 * (1. - ((3000000. - cur_B1) / 2000000.)); } else if (cur_B1 < 11000000.) /* 2900 curves from B1=3e6 to 11e6 */ { B1Mod = 2758.621 * (1. - ((cur_B1 - 3000000.) / 8000000.)); B1Mod +=5818.18182 * (1. - ((11000000. - cur_B1) / 8000000.)); } else if (cur_B1 < 43000000.) /* 5500 curves from B1=11e6 to 43e6 */ { B1Mod = 5818.18182 * (1. - ((cur_B1 - 11000000.) / 32000000.)); B1Mod +=7444.44445 * (1. - ((43000000. - cur_B1) / 32000000.)); } else if (cur_B1 < 110000000.) /* 9000 curves from B1=43e6 to 11e7 */ { B1Mod = 7444.44445 * (1. - ((cur_B1 - 43000000.) / 67000000.)); B1Mod +=6818.18182 * (1. - ((110000000. - cur_B1) / 67000000.)); } else if (cur_B1 < 260000000.) /* 22000 curves from B1=11e7 to 26e7 */ { B1Mod = 6818.18182 * (1. - ((cur_B1 - 110000000.) / 150000000.)); B1Mod +=11346.1539 * (1. - ((260000000. - cur_B1) / 150000000.)); } else if (cur_B1 < 850000000.) /* 52000 curves from B1=26e7 to 85e7 */ { B1Mod = 11346.1539 * (1. - ((cur_B1 - 260000000.) / 590000000.)); B1Mod +=24698.8 * (1. - ((850000000. - cur_B1) / 590000000.)); } else if (cur_B1 < 2900000000.) /* 83000 curves from B1=85e7 to 29e8 */ { B1Mod = 24698.8 * (1. - ((cur_B1 - 850000000.) / 2050000000.)); B1Mod +=50000.0 * (1. - ((2900000000. - cur_B1) / 2050000000.)); } else B1Mod = 50000.; return floor (cur_B1 + const_adj*(B1Mod*incB1val) + 0.5); } /* Here is my "third" attempt at a B1 adjustment function. 
* It seems to adjust too quickly */ double B1Min[12] = { 2000.0, 11000.0, 50000.0, 250000.0, 1000000.0, 3000000.0, 11000000.0, 43000000.0, 110000000.0, 260000000.0, 850000000.0, 2900000000.0 }; double B1Max[12] = { 11000.0, 50000.0, 250000.0, 1000000.0, 3000000.0, 11000000.0, 43000000.0, 110000000.0, 260000000.0, 850000000.0, 2900000000.0, 9000000000.0 }; double B1Inc[12] = { 300.0, 433.334, 833.334, 1500.0, 1818.1819, 2758.621, 5818.1819, 7444.4445, 6818.1819, 11346.1539, 24698.8, 50000.0 }; /*B1Table_t B1Table[12] = {300,0 ,2000.0 ,11000.0 }, {433.334, ,11000.0 ,50000.0 }, {833.334, ,50000.0 ,250000.0 }, {1500.0 ,250000.0 ,1000000.0 }, {1818.1819, ,1000000.0 ,3000000.0 }, {2758.621, ,3000000.0 ,11000000.0 }, {5818.1819, ,11000000.0 ,43000000.0 }, {7444.4445, ,43000000.0 ,110000000.0 }, {6818.1819, ,110000000.0 ,260000000.0 }, NOTE the increment does not look larger enough here!! {11346.1539, ,260000000.0 ,850000000.0 }, {24698.8, ,850000000.0 ,2900000000.0 }, {50000.0, ,2900000000.0 ,9000000000.0 }; */ double calc_B1_AutoIncrement_v3 (double cur_B1, double incB1val, int calcInc) { double B1Mod; if (!calcInc) return cur_B1 + incB1val; /* incB1val is a constant to add to B1 */ /* This simple table was "created" based upon the "Optimal B1 table" in the README file */ if (cur_B1 < 2000.) B1Mod = 200.; else if (cur_B1 > 2900000000.) B1Mod = 50000; else { double OrigMin; int i = 0; while (i < 11 && B1Max[i] < cur_B1) ++i; B1Mod = B1Inc[i] * (1. - ((cur_B1 - B1Min[i]) / (B1Max[i] - B1Min[i]))); OrigMin = B1Min[i]; while (++i < 12) { B1Mod += B1Inc[i] * (1. - ((B1Min[i] - cur_B1) / (B1Min[i] - OrigMin))); } } return floor (cur_B1 + (B1Mod*incB1val) + 0.5); } ecm-6.4.4/ecm-gmp.h0000644023561000001540000001152312110713341010704 00000000000000/* Part of file gmp-impl.h from GNU MP. Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002 Free Software Foundation, Inc. This file contains modified code from the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef _ECM_GMP_H #define _ECM_GMP_H 1 #include "config.h" #include #ifndef alloca #ifdef __GNUC__ # define alloca __builtin_alloca #elif defined (__DECC) # define alloca(x) __ALLOCA(x) #elif defined (_MSC_VER) # include # define alloca _alloca #elif defined(HAVE_ALLOCA_H) || defined (sun) # include #elif defined (_AIX) || defined (_IBMR2) #pragma alloca #else char *alloca (); #endif #endif #define ABSIZ(x) ABS (SIZ (x)) #define ALLOC(x) ((x)->_mp_alloc) #define PTR(x) ((x)->_mp_d) #define SIZ(x) ((x)->_mp_size) #define TMP_DECL(m) #define TMP_ALLOC(x) alloca(x) #define TMP_MARK(m) #define TMP_FREE(m) #define TMP_ALLOC_TYPE(n,type) ((type *) TMP_ALLOC ((n) * sizeof (type))) #define TMP_ALLOC_LIMBS(n) TMP_ALLOC_TYPE(n,mp_limb_t) #ifndef MPZ_REALLOC #define MPZ_REALLOC(z,n) ((n) > ALLOC(z) ? 
_mpz_realloc(z,n) : PTR(z)) #endif #ifndef MPN_COPY #include /* for memcpy */ #define MPN_COPY(d,s,n) memcpy((d),(s),(n)*sizeof(mp_limb_t)) #endif #ifndef MPN_NORMALIZE #define MPN_NORMALIZE(DST, NLIMBS) \ do { \ while (NLIMBS > 0) \ { \ if ((DST)[(NLIMBS) - 1] != 0) \ break; \ NLIMBS--; \ } \ } while (0) #endif #ifndef MPN_ZERO #define MPN_ZERO(dst, n) \ do { \ if ((n) != 0) \ { \ mp_ptr __dst = (dst); \ mp_size_t __n = (n); \ do \ *__dst++ = 0; \ while (--__n); \ } \ } while (0) #endif /* Return non-zero if xp,xsize and yp,ysize overlap. If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no overlap. If both these are false, there's an overlap. */ #define MPN_OVERLAP_P(xp, xsize, yp, ysize) \ ((xp) + (xsize) > (yp) && (yp) + (ysize) > (xp)) /* Return non-zero if xp,xsize and yp,ysize are either identical or not overlapping. Return zero if they're partially overlapping. */ #define MPN_SAME_OR_SEPARATE_P(xp, yp, size) \ MPN_SAME_OR_SEPARATE2_P(xp, size, yp, size) #define MPN_SAME_OR_SEPARATE2_P(xp, xsize, yp, ysize) \ ((xp) == (yp) || ! 
MPN_OVERLAP_P (xp, xsize, yp, ysize)) #ifndef mpn_com_n #define mpn_com_n(d,s,n) \ do { \ mp_ptr __d = (d); \ mp_srcptr __s = (s); \ mp_size_t __n = (n); \ ASSERT (__n >= 1); \ ASSERT (MPN_SAME_OR_SEPARATE_P (__d, __s, __n)); \ do \ *__d++ = (~ *__s++) & GMP_NUMB_MASK; \ while (--__n); \ } while (0) #endif #ifdef HAVE___GMPN_ADD_NC #ifndef __gmpn_add_nc __GMP_DECLSPEC mp_limb_t __gmpn_add_nc (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t); #endif #endif #define ECM_VERSION_NUM(a,b,c) (((a) << 16L) | ((b) << 8) | (c)) #if !defined( __MPIR_RELEASE ) && ECM_VERSION_NUM(__GNU_MP_VERSION,__GNU_MP_VERSION_MINOR,__GNU_MP_VERSION_PATCHLEVEL) >= ECM_VERSION_NUM(5,1,0) #define MPN_REDC12_RETURNS_CARRY 1 #endif /* GMP currently does not define prototypes for these, but MPIR does */ #if defined(HAVE___GMPN_REDC_1) && !defined( __MPIR_RELEASE ) #ifdef MPN_REDC12_RETURNS_CARRY mp_limb_t __gmpn_redc_1 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t); #else void __gmpn_redc_1 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t); #endif #endif #if defined(HAVE___GMPN_REDC_2) && !defined( __MPIR_RELEASE ) #ifdef MPN_REDC12_RETURNS_CARRY mp_limb_t __gmpn_redc_2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #else void __gmpn_redc_2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #endif #endif #if defined(HAVE___GMPN_REDC_N) void __gmpn_redc_n (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #endif #if defined(HAVE___GMPN_MULLO_N) void __gmpn_mullo_n (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #endif #endif /* _ECM_GMP_H */ ecm-6.4.4/mul_fft-params.h.pentium40000644023561000001540000001073112106741273014052 00000000000000#define MUL_FFT_MODF_THRESHOLD 480 #define SQR_FFT_MODF_THRESHOLD 480 #define MUL_FFT_TABLE2 {{1, 4 /*66*/}, {305, 5 /*95*/}, {321, 4 /*97*/}, {337, 5 /*95*/}, {353, 4 /*97*/}, {369, 5 /*96*/}, {801, 6 /*96*/}, {1281, 7 /*91*/}, {1409, 6 /*97*/}, {1601, 7 /*92*/}, {1921, 6 /*98*/}, {1985, 7 /*94*/}, {2689, 8 /*91*/}, {2817, 7 /*95*/}, {3201, 8 /*92*/}, 
{3329, 7 /*96*/}, {3457, 8 /*87*/}, {3841, 7 /*96*/}, {3969, 8 /*88*/}, {4865, 7 /*97*/}, {4993, 8 /*90*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {8961, 9 /*90*/}, {9729, 8 /*97*/}, {9985, 9 /*83*/}, {11777, 8 /*97*/}, {12033, 9 /*85*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, {15873, 8 /*98*/}, {16129, 9 /*88*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {26113, 10 /*81*/}, {31745, 9 /*98*/}, {34305, 10 /*85*/}, {39937, 9 /*98*/}, {40449, 10 /*83*/}, {48129, 11 /*75*/}, {63489, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {129025, 9 /*98*/}, {130561, 11 /*80*/}, {194561, 12 /*75*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 9 /*99*/}, {278017, 10 /*94*/}, {293889, 9 /*99*/}, {294401, 7 /*99*/}, {294529, 8 /*99*/}, {294657, 10 /*94*/}, {310273, 9 /*99*/}, {310785, 10 /*95*/}, {326657, 12 /*83*/}, {389121, 13 /*75*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {662529, 11 /*96*/}, {686081, 10 /*99*/}, {687105, 9 /*99*/}, {687617, 11 /*95*/}, {718849, 10 /*99*/}, {752641, 9 /*99*/}, {753153, 11 /*95*/}, {784385, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {980993, 10 /*99*/}, {982017, 12 /*93*/}, {LONG_MAX, 0}} #define MUL_FFTM_TABLE2 {{1, 4 /*66*/}, {273, 5 /*94*/}, {289, 4 /*97*/}, {305, 5 /*95*/}, {609, 6 /*95*/}, {641, 5 /*97*/}, {673, 6 /*95*/}, {705, 5 /*97*/}, {737, 6 /*96*/}, {1473, 7 /*96*/}, {1537, 6 /*98*/}, {1601, 7 /*96*/}, {1665, 6 /*98*/}, {1729, 7 /*96*/}, {2689, 8 /*91*/}, {2817, 7 /*97*/}, {2945, 8 /*92*/}, {3329, 7 /*98*/}, {3457, 8 /*93*/}, {5377, 9 /*91*/}, {5633, 8 /*95*/}, {6401, 9 /*92*/}, {6657, 8 /*96*/}, {6913, 9 /*87*/}, {7681, 8 /*96*/}, {7937, 9 /*88*/}, {8705, 8 /*97*/}, {8961, 9 /*90*/}, {13825, 10 /*87*/}, {15361, 9 /*96*/}, 
{17921, 10 /*90*/}, {19457, 9 /*97*/}, {19969, 10 /*83*/}, {23553, 9 /*97*/}, {24065, 10 /*85*/}, {27649, 11 /*87*/}, {30721, 10 /*96*/}, {31745, 9 /*98*/}, {32257, 10 /*88*/}, {39937, 11 /*83*/}, {47105, 10 /*97*/}, {48129, 12 /*75*/}, {61441, 11 /*96*/}, {63489, 10 /*98*/}, {68609, 11 /*85*/}, {79873, 10 /*98*/}, {80897, 11 /*83*/}, {96257, 12 /*75*/}, {126977, 11 /*98*/}, {161793, 12 /*83*/}, {192513, 13 /*75*/}, {253953, 12 /*98*/}, {258049, 10 /*98*/}, {261121, 9 /*99*/}, {261633, 10 /*94*/}, {277505, 12 /*85*/}, {323585, 10 /*99*/}, {326657, 9 /*99*/}, {327169, 10 /*95*/}, {330753, 12 /*84*/}, {389121, 10 /*99*/}, {392193, 9 /*99*/}, {392705, 10 /*96*/}, {408577, 9 /*99*/}, {409089, 8 /*99*/}, {409345, 10 /*96*/}, {412673, 12 /*90*/}, {454657, 13 /*87*/}, {516097, 11 /*98*/}, {522241, 10 /*99*/}, {523265, 11 /*94*/}, {555009, 10 /*99*/}, {556033, 9 /*99*/}, {556545, 11 /*94*/}, {587777, 10 /*99*/}, {588801, 11 /*94*/}, {620545, 10 /*99*/}, {621569, 9 /*99*/}, {622081, 11 /*95*/}, {653313, 10 /*99*/}, {654337, 11 /*95*/}, {686081, 13 /*87*/}, {778241, 11 /*99*/}, {817153, 10 /*99*/}, {818177, 9 /*99*/}, {818689, 11 /*96*/}, {849921, 10 /*99*/}, {850945, 11 /*96*/}, {882689, 10 /*99*/}, {883713, 9 /*99*/}, {884225, 11 /*96*/}, {915457, 12 /*93*/}, {978945, 14 /*93*/}, {LONG_MAX, 0}} #define MUL_FFT_FULL_TABLE2 {{100, 2}, {216, 1}, {256, 2}, {264, 1}, {304, 2}, {312, 1}, {544, 4}, {560, 1}, {704, 2}, {720, 1}, {896, 2}, {960, 7}, {40960, 2}, {47616, 1}, {49152, 6}, {53760, 4}, {56320, 1}, {64512, 4}, {71680, 5}, {86016, 2}, {96768, 4}, {99840, 1}, {131072, 6}, {136192, 7}, {147456, 6}, {150528, 4}, {161280, 1}, {161792, 3}, {172032, 2}, {193536, 1}, {259072, 6}, {286720, 7}, {294912, 6}, {301056, 4}, {322560, 3}, {344064, 2}, {387072, 1}, {393216, 4}, {404480, 3}, {409600, 1}, {417792, 3}, {425984, 1}, {524288, 6}, {530432, 7}, {557056, 6}, {566272, 5}, {577536, 4}, {593920, 6}, {602112, 5}, {614400, 4}, {645120, 3}, {647168, 4}, {652800, 1}, {654336, 6}, 
{673792, 3}, {688128, 2}, {724992, 4}, {727040, 1}, {753664, 2}, {783360, 4}, {816640, 6}, {831488, 1}, {851968, 2}, {860160, 3}, {868352, 2}, {881664, 7}, {884736, 1}, {921600, 7}, {950272, 1}, {LONG_MAX, 1}} ecm-6.4.4/ecm-impl.h0000644023561000001540000007452112110712022011064 00000000000000/* ecm-impl.h - header file for libecm Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Paul Zimmermann, Alexander Kruppa and Cyril Bouvier. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef _ECM_IMPL_H #define _ECM_IMPL_H 1 #include "config.h" #include "ecm.h" #ifdef HAVE_SYS_TYPES_H #include /* needed for size_t */ #endif #if HAVE_STDINT_H #include /* needed for int64_t and uint64_t */ /* or configure will define these for us if possible */ #endif #if defined UINT64_MAX || defined uint64_t typedef int64_t ecm_int; typedef uint64_t ecm_uint; #define ECM_INT_MAX INT64_MAX #define ECM_UINT_MAX UINT64_MAX #elif defined HAVE_LONG_LONG_INT typedef long long ecm_int; typedef unsigned long long ecm_uint; #define ECM_INT_MAX LLONG_MAX #define ECM_UINT_MAX ULLONG_MAX #else typedef long ecm_int; typedef unsigned long ecm_uint; #define ECM_INT_MAX LONG_MAX #define ECM_UINT_MAX ULONG_MAX #endif #ifndef TUNE #include "ecm-params.h" #else extern size_t MPZMOD_THRESHOLD; extern size_t REDC_THRESHOLD; #endif extern size_t mpn_mul_lo_threshold[]; #include /* needed for "FILE *" */ #include #if defined (__STDC__) \ || defined (__cplusplus) \ || defined (_AIX) \ || defined (__DECC) \ || (defined (__mips) && defined (_SYSTYPE_SVR4)) \ || defined (_MSC_VER) \ || defined (_WIN32) #define __ECM_HAVE_TOKEN_PASTE 1 #else #define __ECM_HAVE_TOKEN_PASTE 0 #endif #ifndef __ECM #if __ECM_HAVE_TOKEN_PASTE #define __ECM(x) __ecm_##x #else #define __ECM(x) __ecm_/**/x #endif #endif #define ECM_STDOUT __ecm_stdout #define ECM_STDERR __ecm_stderr extern FILE *ECM_STDOUT, *ECM_STDERR; /* Warnings about unused parameters by gcc can be suppressed by prefixing parameter with ATTRIBUTE_UNUSED when parameter can't be removed, i.e. 
for interface consistency reasons */ #ifdef __GNUC__ #if __GNUC__ >= 3 #define ATTRIBUTE_UNUSED __attribute__ ((unused)) #else #define ATTRIBUTE_UNUSED #endif #define ATTRIBUTE_CONST __attribute__ ((const)) #else #define ATTRIBUTE_UNUSED #define ATTRIBUTE_CONST #endif #ifndef LIKELY #if defined(__GNUC__) #define LIKELY(x) __builtin_expect ((x) != 0, 1) #else #define LIKELY(x) x #endif #endif #ifndef UNLIKELY #if defined(__GNUC__) #define UNLIKELY(x) __builtin_expect ((x) != 0, 0) #else #define UNLIKELY(x) x #endif #endif /* default B2 choice: pow (B1 * METHOD_COST / 6.0, DEFAULT_B2_EXPONENT) */ #define DEFAULT_B2_EXPONENT 1.43 #define PM1_COST 1.0 / 6.0 #define PP1_COST 2.0 / 6.0 #define ECM_COST 11.0 / 6.0 /* For new P-/+1 stage 2: */ #define PM1FS2_DEFAULT_B2_EXPONENT 1.7 #define PM1FS2_COST 1.0 / 4.0 #define PP1FS2_COST 1.0 / 4.0 /* if POLYEVALTELLEGEN is defined, use polyeval_tellegen(), otherwise use polyeval() */ #define POLYEVALTELLEGEN /* use Kronecker-Scho"nhage's multiplication */ #define KS_MULTIPLY /* define top-level multiplication */ #define KARA 2 #define TOOM3 3 #define TOOM4 4 #define KS 5 #define NTT 6 /* maximal limb size of assembly mulredc */ #define MULREDC_ASSEMBLY_MAX 20 #include "sp.h" /* compile with -DMULT=2 to override default */ #ifndef MULT #ifdef KS_MULTIPLY #define MULT KS #else #define MULT TOOM4 #endif #endif #ifdef POLYEVALTELLEGEN #define USE_SHORT_PRODUCT #endif #include #define ASSERT_ALWAYS(expr) assert (expr) #ifdef WANT_ASSERT #define ASSERT(expr) assert (expr) #else #define ASSERT(expr) do {} while (0) #endif #ifdef MEMORY_DEBUG void tests_free (void *, size_t); void tests_memory_set_location (char *, unsigned int); #define FREE(ptr,size) tests_free(ptr,size) #define MEMORY_TAG tests_memory_set_location(__FILE__,__LINE__) #define MEMORY_UNTAG tests_memory_set_location("",0) #define MPZ_INIT(x) {MEMORY_TAG;mpz_init(x);MEMORY_UNTAG;} #define MPZ_INIT2(x,n) {MEMORY_TAG;mpz_init2(x,n);MEMORY_UNTAG;} #else #define FREE(ptr,size) 
free(ptr) #define MEMORY_TAG do{}while(0) #define MEMORY_UNTAG do{}while(0) #define MPZ_INIT(x) mpz_init(x) #define MPZ_INIT2(x,n) mpz_init2(x,n) #endif /* thresholds */ #define MPN_MUL_LO_THRESHOLD 32 /* base2mod is used when size(2^n+/-1) <= BASE2_THRESHOLD * size(cofactor) */ #define BASE2_THRESHOLD 1.4 /* default number of probable prime tests */ #define PROBAB_PRIME_TESTS 1 /* kronecker_schonhage() is used instead of toomcook4() when bitsize(poly) >= KS_MUL_THRESHOLD */ #define KS_MUL_THRESHOLD 1e6 /* same for median product */ #define KS_TMUL_THRESHOLD 8e5 #define ABS(x) ((x) >= 0 ? (x) : -(x)) /* getprime */ #define WANT_FREE_PRIME_TABLE(p) (p < 0.0) #define FREE_PRIME_TABLE -1.0 /* 2^n+-1 with n < MOD_MINBASE2 cannot use base-2 reduction */ #define MOD_MINBASE2 16 /* Various logging levels */ /* OUTPUT_ALWAYS means print always, regardless of verbose value */ #define OUTPUT_ALWAYS 0 /* OUTPUT_NORMAL means print during normal program execution */ #define OUTPUT_NORMAL 1 /* OUTPUT_VERBOSE means print if the user requested more verbosity */ #define OUTPUT_VERBOSE 2 /* OUTPUT_RESVERBOSE is for printing residues (after stage 1 etc) */ #define OUTPUT_RESVERBOSE 3 /* OUTPUT_DEVVERBOSE is for printing internal parameters (for developers) */ #define OUTPUT_DEVVERBOSE 4 /* OUTPUT_TRACE is for printing trace data, produces lots of output */ #define OUTPUT_TRACE 5 /* OUTPUT_ERROR is for printing error messages */ #define OUTPUT_ERROR -1 /* Interval length for writing checkpoints in stage 1, in milliseconds */ #define CHKPNT_PERIOD 600000 typedef mpz_t mpres_t; typedef mpz_t* listz_t; typedef struct { mpres_t x; mpres_t y; } __point_struct; typedef __point_struct point; typedef struct { mpres_t x; mpres_t y; mpres_t A; } __curve_struct; typedef __curve_struct curve; typedef struct { unsigned long d1; unsigned long d2; mpz_t i0; int S; } __root_params_t; typedef __root_params_t root_params_t; typedef struct { unsigned long P, s_1, s_2, l; mpz_t m_1; } 
__faststage2_param_t; typedef __faststage2_param_t faststage2_param_t; #define EC_MONTGOMERY_FORM 0 #define EC_WEIERSTRASS_FORM 1 typedef struct { unsigned int size_fd; /* How many entries .fd has, always nr * (S+1) */ unsigned int nr; /* How many separate progressions there are */ unsigned int next; /* From which progression to take the next root */ unsigned int S; /* Degree of the polynomials */ unsigned int dsieve; /* Values not coprime to dsieve are skipped */ unsigned int rsieve; /* Which residue mod dsieve current .next belongs to */ int dickson_a; /* Parameter for Dickson polynomials */ } progression_params_t; typedef struct { progression_params_t params; point *fd; unsigned int size_T; /* How many entries T has */ mpres_t *T; /* For temp values. FIXME: should go! */ curve *X; /* The curve the points are on */ } ecm_roots_state_t; typedef struct { progression_params_t params; mpres_t *fd; int invtrick; } pm1_roots_state_t; typedef struct { progression_params_t params; point *fd; /* for S != 1 */ mpres_t tmp[4]; /* for S=1 */ } pp1_roots_state_t; typedef struct { int alloc; int degree; listz_t coeff; } __polyz_struct; typedef __polyz_struct polyz_t[1]; typedef struct { int repr; /* ECM_MOD_MPZ: plain modulus, possibly normalized ECM_MOD_BASE2: base 2 number ECM_MOD_MODMULN: MODMULN ECM_MOD_REDC: REDC representation */ int bits; /* in case of a base 2 number, 2^k[+-]1, bits = [+-]k in case of MODMULN or REDC representation, nr. of bits b so that 2^b > orig_modulus and GMP_NUMB_BITS | b */ int Fermat; /* If repr = 1 (base 2 number): If modulus is 2^(2^m)+1, i.e. bits = 2^m, then Fermat = 2^m, 0 otherwise. If repr != 1, undefined */ mp_limb_t *Nprim; /* For MODMULN */ mpz_t orig_modulus; /* The original modulus N */ mpz_t aux_modulus; /* Used only for MPZ and REDC: - the auxiliary modulus value (i.e. 
normalized modulus, or -1/N (mod 2^bits) for REDC, - B^(n + ceil(n/2)) mod N for MPZ, where B = 2^GMP_NUMB_BITS */ mpz_t multiple; /* The smallest multiple of N that is larger or equal to 2^bits for REDC/MODMULN */ mpz_t R2, R3; /* For MODMULN and REDC, R^2 and R^3 (mod orig_modulus), where R = 2^bits. */ mpz_t temp1, temp2; /* Temp values used during multiplication etc. */ } __mpmod_struct; typedef __mpmod_struct mpmod_t[1]; #if defined (__cplusplus) extern "C" { #endif /* getprime.c */ #define getprime __ECM(getprime) double getprime (); #define getprime_clear __ECM(getprime_clear) void getprime_clear (); #define getprime_seek __ECM(getprime_seek) void getprime_seek (double); /* pm1.c */ #define pm1_rootsF __ECM(pm1_rootsF) int pm1_rootsF (mpz_t, listz_t, root_params_t *, unsigned long, mpres_t *, listz_t, mpmod_t); #define pm1_rootsG_init __ECM(pm1_rootsG_init) pm1_roots_state_t* pm1_rootsG_init (mpres_t *, root_params_t *, mpmod_t); #define pm1_rootsG __ECM(pm1_rootsG) int pm1_rootsG (mpz_t, listz_t, unsigned long, pm1_roots_state_t *, listz_t, mpmod_t); #define pm1_rootsG_clear __ECM(pm1_rootsG_clear) void pm1_rootsG_clear (pm1_roots_state_t *, mpmod_t); /* pm1fs2.c */ #define pm1fs2_memory_use __ECM(pm1fs2_ntt_memory_use) size_t pm1fs2_memory_use (const unsigned long, const mpz_t, const int); #define pm1fs2_maxlen __ECM(pm1fs2_maxlen) unsigned long pm1fs2_maxlen (const size_t, const mpz_t, const int); #define pp1fs2_memory_use __ECM(pp1fs2_ntt_memory_use) size_t pp1fs2_memory_use (const unsigned long, const mpz_t, const int, const int); #define pp1fs2_maxlen __ECM(pp1fs2_maxlen) unsigned long pp1fs2_maxlen (const size_t, const mpz_t, const int, const int); #define choose_P __ECM(choose_P) long choose_P (const mpz_t, const mpz_t, const unsigned long, const unsigned long, faststage2_param_t *, mpz_t, mpz_t, const int, const int); #define pm1fs2 __ECM(pm1fs2) int pm1fs2 (mpz_t, const mpres_t, mpmod_t, const faststage2_param_t *); #define pm1fs2_ntt 
__ECM(pm1fs2_ntt) int pm1fs2_ntt (mpz_t, const mpres_t, mpmod_t, const faststage2_param_t *); #define pp1fs2 __ECM(pp1fs2) int pp1fs2 (mpz_t, const mpres_t, mpmod_t, const faststage2_param_t *); #define pp1fs2_ntt __ECM(pp1fs2_ntt) int pp1fs2_ntt (mpz_t, const mpres_t, mpmod_t, const faststage2_param_t *, const int); /* bestd.c */ #define bestD __ECM(bestD) int bestD (root_params_t *, unsigned long *, unsigned long *, mpz_t, mpz_t, int, int, double, int, mpmod_t); /* ecm.c */ #define choose_S __ECM(choose_S) int choose_S (mpz_t); #define add3 __ECM(add3) void add3 (mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t, mpmod_t, mpres_t, mpres_t, mpres_t); #define duplicate __ECM(duplicate) void duplicate (mpres_t, mpres_t, mpres_t, mpres_t, mpmod_t, mpres_t, mpres_t, mpres_t, mpres_t); #define ecm_mul __ECM(ecm_mul) void ecm_mul (mpres_t, mpres_t, mpz_t, mpmod_t, mpres_t); #define print_B1_B2_poly __ECM(print_B1_B2_poly) void print_B1_B2_poly (int, int, double, double, mpz_t, mpz_t, mpz_t, int S, mpz_t, int, mpz_t); /* ecm2.c */ #define ecm_rootsF __ECM(ecm_rootsF) int ecm_rootsF (mpz_t, listz_t, root_params_t *, unsigned long, curve *, mpmod_t); #define ecm_rootsG_init __ECM(ecm_rootsG_init) ecm_roots_state_t* ecm_rootsG_init (mpz_t, curve *, root_params_t *, unsigned long, unsigned long, mpmod_t); #define ecm_rootsG __ECM(ecm_rootsG) int ecm_rootsG (mpz_t, listz_t, unsigned long, ecm_roots_state_t *, mpmod_t); #define ecm_rootsG_clear __ECM(ecm_rootsG_clear) void ecm_rootsG_clear (ecm_roots_state_t *, mpmod_t); #define ecm_findmatch __ECM(ecm_findmatch) int ecm_findmatch (unsigned long *, const unsigned long, root_params_t *, const curve *, mpmod_t, const mpz_t); /* lucas.c */ #define pp1_mul_prac __ECM(pp1_mul_prac) void pp1_mul_prac (mpres_t, ecm_uint, mpmod_t, mpres_t, mpres_t, mpres_t, mpres_t, mpres_t); /* pp1.c */ #define pp1_rootsF __ECM(pp1_rootsF) int pp1_rootsF (listz_t, root_params_t *, unsigned long, mpres_t *, listz_t, mpmod_t); 
#define pp1_rootsG __ECM(pp1_rootsG) int pp1_rootsG (listz_t, unsigned long, pp1_roots_state_t *, mpmod_t, mpres_t*); #define pp1_rootsG_init __ECM(pp1_rootsG_init) pp1_roots_state_t* pp1_rootsG_init (mpres_t*, root_params_t *, mpmod_t); #define pp1_rootsG_clear __ECM(pp1_rootsG_clear) void pp1_rootsG_clear (pp1_roots_state_t *, mpmod_t); /* stage2.c */ #define stage2 __ECM(stage2) int stage2 (mpz_t, void *, mpmod_t, unsigned long, unsigned long, root_params_t *, int, int, char *, int (*)(void)); #define init_progression_coeffs __ECM(init_progression_coeffs) listz_t init_progression_coeffs (mpz_t, const unsigned long, const unsigned long, const unsigned int, const unsigned int, const unsigned int, const int); #define init_roots_params __ECM(init_roots_params) void init_roots_params (progression_params_t *, const int, const unsigned long, const unsigned long, const double); #define memory_use __ECM(memory_use) double memory_use (unsigned long, unsigned int, unsigned int, mpmod_t); /* listz.c */ #define list_mul_mem __ECM(list_mul_mem) int list_mul_mem (unsigned int); #define init_list __ECM(init_list) listz_t init_list (unsigned int); #define init_list2 __ECM(init_list2) listz_t init_list2 (unsigned int, unsigned int); #define clear_list __ECM(clear_list) void clear_list (listz_t, unsigned int); #define list_inp_raw __ECM(list_inp_raw) int list_inp_raw (listz_t, FILE *, unsigned int); #define list_out_raw __ECM(list_out_raw) int list_out_raw (FILE *, listz_t, unsigned int); #define print_list __ECM(print_list) void print_list (listz_t, unsigned int); #define list_set __ECM(list_set) void list_set (listz_t, listz_t, unsigned int); #define list_revert __ECM(list_revert) void list_revert (listz_t, unsigned int); #define list_swap __ECM(list_swap) void list_swap (listz_t, listz_t, unsigned int); #define list_neg __ECM(list_neg) void list_neg (listz_t, listz_t, unsigned int, mpz_t); #define list_mod __ECM(list_mod) void list_mod (listz_t, listz_t, unsigned int, mpz_t); 
#define list_add __ECM(list_add) void list_add (listz_t, listz_t, listz_t, unsigned int); #define list_sub __ECM(list_sub) void list_sub (listz_t, listz_t, listz_t, unsigned int); #define list_mul_z __ECM(list_mul_z) void list_mul_z (listz_t, listz_t, mpz_t, unsigned int, mpz_t); #define list_gcd __ECM(list_gcd) int list_gcd (mpz_t, listz_t, unsigned int, mpz_t); #define list_mulup __ECM(list_mulup) void list_mulup (listz_t, unsigned int, mpz_t, mpz_t); #define list_zero __ECM(list_zero) void list_zero (listz_t, unsigned int); #define list_mul __ECM(list_mul) void list_mul (listz_t, listz_t, unsigned int, int, listz_t, unsigned int, int, listz_t); #define list_mul_high __ECM(list_mul_high) void list_mul_high (listz_t, listz_t, listz_t, unsigned int, listz_t); #define karatsuba __ECM(karatsuba) void karatsuba (listz_t, listz_t, listz_t, unsigned int, listz_t); #define list_mulmod __ECM(list_mulmod) void list_mulmod (listz_t, listz_t, listz_t, listz_t, unsigned int, listz_t, mpz_t); #define list_invert __ECM(list_invert) int list_invert (listz_t, listz_t, unsigned long, mpz_t, mpmod_t); #define PolyFromRoots __ECM(PolyFromRoots) void PolyFromRoots (listz_t, listz_t, unsigned int, listz_t, mpz_t); #define PolyFromRoots_Tree __ECM(PolyFromRoots_Tree) int PolyFromRoots_Tree (listz_t, listz_t, unsigned int, listz_t, int, mpz_t, listz_t*, FILE*, unsigned int); #define ntt_PolyFromRoots __ECM(ntt_PolyFromRoots) void ntt_PolyFromRoots (mpzv_t, mpzv_t, spv_size_t, mpzv_t, mpzspm_t); #define ntt_PolyFromRoots_Tree __ECM(ntt_PolyFromRoots_Tree) int ntt_PolyFromRoots_Tree (mpzv_t, mpzv_t, spv_size_t, mpzv_t, int, mpzspm_t, mpzv_t *, FILE *); #define ntt_polyevalT __ECM(ntt_polyevalT) int ntt_polyevalT (mpzv_t, spv_size_t, mpzv_t *, mpzv_t, mpzspv_t, mpzspm_t, char *); #define ntt_mul __ECM(ntt_mul) void ntt_mul (mpzv_t, mpzv_t, mpzv_t, spv_size_t, mpzv_t, int, mpzspm_t); #define ntt_PrerevertDivision __ECM(ntt_PrerevertDivision) void ntt_PrerevertDivision (mpzv_t, mpzv_t, 
mpzv_t, mpzspv_t, mpzspv_t, spv_size_t, mpzv_t, mpzspm_t); #define ntt_PolyInvert __ECM(ntt_PolyInvert) void ntt_PolyInvert (mpzv_t, mpzv_t, spv_size_t, mpzv_t, mpzspm_t); #define PrerevertDivision __ECM(PrerevertDivision) int PrerevertDivision (listz_t, listz_t, listz_t, unsigned int, listz_t, mpz_t); #define PolyInvert __ECM(PolyInvert) void PolyInvert (listz_t, listz_t, unsigned int, listz_t, mpz_t); #define RecursiveDivision __ECM(RecursiveDivision) void RecursiveDivision (listz_t, listz_t, listz_t, unsigned int, listz_t, mpz_t, int); /* polyeval.c */ #define polyeval __ECM(polyeval) void polyeval (listz_t, unsigned int, listz_t*, listz_t, mpz_t, unsigned int); #define polyeval_tellegen __ECM(polyeval_tellegen) int polyeval_tellegen (listz_t, unsigned int, listz_t*, listz_t, unsigned int, listz_t, mpz_t, char *); #define TUpTree __ECM(TUpTree) void TUpTree (listz_t, listz_t *, unsigned int, listz_t, int, unsigned int, mpz_t, FILE *); /* toomcook.c */ #define toomcook3 __ECM(toomcook3) void toomcook3 (listz_t, listz_t, listz_t, unsigned int, listz_t); #define toomcook4 __ECM(toomcook4) void toomcook4 (listz_t, listz_t, listz_t, unsigned int, listz_t); /* ks-multiply.c */ #define kronecker_schonhage __ECM(kronecker_schonhage) void kronecker_schonhage (listz_t, listz_t, listz_t, unsigned int, listz_t); #define TMulKS __ECM(TMulKS) int TMulKS (listz_t, unsigned int, listz_t, unsigned int, listz_t, unsigned int, mpz_t, int); #define ks_wrapmul_m __ECM(ks_wrapmul_m) unsigned int ks_wrapmul_m (unsigned int, unsigned int, mpz_t); #define ks_wrapmul __ECM(ks_wrapmul) unsigned int ks_wrapmul (listz_t, unsigned int, listz_t, unsigned int, listz_t, unsigned int, mpz_t); /* mpmod.c */ /* Define MPRESN_NO_ADJUSTMENT if mpresn_add, mpresn_sub and mpresn_addsub should perform no adjustment step. This yields constraints on N. 
*/ /* #define MPRESN_NO_ADJUSTMENT */ #define isbase2 __ECM(isbase2) int isbase2 (const mpz_t, const double); #define mpmod_init __ECM(mpmod_init) int mpmod_init (mpmod_t, const mpz_t, int); #define mpmod_init_MPZ __ECM(mpmod_init_MPZ) void mpmod_init_MPZ (mpmod_t, const mpz_t); #define mpmod_init_BASE2 __ECM(mpmod_init_BASE2) int mpmod_init_BASE2 (mpmod_t, const int, const mpz_t); #define mpmod_init_MODMULN __ECM(mpmod_init_MODMULN) void mpmod_init_MODMULN (mpmod_t, const mpz_t); #define mpmod_init_REDC __ECM(mpmod_init_REDC) void mpmod_init_REDC (mpmod_t, const mpz_t); #define mpmod_clear __ECM(mpmod_clear) void mpmod_clear (mpmod_t); #define mpmod_init_set __ECM(mpmod_init_set) void mpmod_init_set (mpmod_t, const mpmod_t); #define mpmod_pausegw __ECM(mpmod_pausegw) void mpmod_pausegw (const mpmod_t modulus); #define mpmod_contgw __ECM(mpmod_contgw) void mpmod_contgw (const mpmod_t modulus); #define mpres_equal __ECM(mpres_equal) int mpres_equal (const mpres_t, const mpres_t, mpmod_t); #define mpres_pow __ECM(mpres_pow) void mpres_pow (mpres_t, const mpres_t, const mpz_t, mpmod_t); #define mpres_ui_pow __ECM(mpres_ui_pow) void mpres_ui_pow (mpres_t, const unsigned long, const mpres_t, mpmod_t); #define mpres_mul __ECM(mpres_mul) void mpres_mul (mpres_t, const mpres_t, const mpres_t, mpmod_t) ATTRIBUTE_HOT; #define mpres_sqr __ECM(mpres_sqr) void mpres_sqr (mpres_t, const mpres_t, mpmod_t) ATTRIBUTE_HOT; #define mpres_mul_z_to_z __ECM(mpres_mul_z_to_z) void mpres_mul_z_to_z (mpz_t, const mpres_t, const mpz_t, mpmod_t); #define mpres_set_z_for_gcd __ECM(mpres_set_z_for_gcd) void mpres_set_z_for_gcd (mpres_t, const mpz_t, mpmod_t); #define mpres_div_2exp __ECM(mpres_div_2exp) void mpres_div_2exp (mpres_t, const mpres_t, const unsigned int, mpmod_t); #define mpres_add_ui __ECM(mpres_add_ui) void mpres_add_ui (mpres_t, const mpres_t, const unsigned long, mpmod_t); #define mpres_add __ECM(mpres_add) void mpres_add (mpres_t, const mpres_t, const mpres_t, mpmod_t) 
ATTRIBUTE_HOT; #define mpres_sub_ui __ECM(mpres_sub_ui) void mpres_sub_ui (mpres_t, const mpres_t, const unsigned long, mpmod_t); #define mpres_ui_sub __ECM(mpres_ui_sub) void mpres_ui_sub (mpres_t, const unsigned long, const mpres_t, mpmod_t); #define mpres_sub __ECM(mpres_sub) void mpres_sub (mpres_t, const mpres_t, const mpres_t, mpmod_t) ATTRIBUTE_HOT; #define mpres_set_z __ECM(mpres_set_z) void mpres_set_z (mpres_t, const mpz_t, mpmod_t); #define mpres_get_z __ECM(mpres_get_z) void mpres_get_z (mpz_t, const mpres_t, mpmod_t); #define mpres_set_ui __ECM(mpres_set_ui) void mpres_set_ui (mpres_t, const unsigned long, mpmod_t); #define mpres_set_si __ECM(mpres_set_si) void mpres_set_si (mpres_t, const long, mpmod_t); #define mpres_init __ECM(mpres_init) void mpres_init (mpres_t, const mpmod_t); #define mpres_clear __ECM(mpres_clear) void mpres_clear (mpres_t, const mpmod_t); #define mpres_realloc __ECM(mpres_realloc) void mpres_realloc (mpres_t, const mpmod_t); #define mpres_mul_ui __ECM(mpres_mul_ui) void mpres_mul_ui (mpres_t, const mpres_t, const unsigned long, mpmod_t); #define mpres_mul_2exp __ECM(mpres_mul_2exp) void mpres_mul_2exp (mpres_t, const mpres_t, const unsigned long, mpmod_t); #define mpres_muldivbysomething_si __ECM(mpres_muldivbysomething_si) void mpres_muldivbysomething_si (mpres_t, const mpres_t, const long, mpmod_t); #define mpres_neg __ECM(mpres_neg) void mpres_neg (mpres_t, const mpres_t, mpmod_t); #define mpres_invert __ECM(mpres_invert) int mpres_invert (mpres_t, const mpres_t, mpmod_t); #define mpres_gcd __ECM(mpres_gcd) void mpres_gcd (mpz_t, const mpres_t, const mpmod_t); #define mpres_out_str __ECM(mpres_out_str) void mpres_out_str (FILE *, const unsigned int, const mpres_t, mpmod_t); #define mpres_is_zero __ECM(mpres_is_zero) int mpres_is_zero (const mpres_t, mpmod_t); #define mpres_set(a,b,n) mpz_set (a, b) #define mpres_swap(a,b,n) mpz_swap (a, b) #define mpresn_mul __ECM(mpresn_mul) void mpresn_mul (mpres_t, const mpres_t, const 
mpres_t, mpmod_t); #define mpresn_addsub __ECM(mpresn_addsub) void mpresn_addsub (mpres_t, mpres_t, const mpres_t, const mpres_t, mpmod_t); #define mpresn_pad __ECM(mpresn_pad) void mpresn_pad (mpres_t R, mpmod_t N); #define mpresn_unpad __ECM(mpresn_unpad) void mpresn_unpad (mpres_t R); #define mpresn_sqr __ECM(mpresn_sqr) void mpresn_sqr (mpres_t, const mpres_t, mpmod_t); #define mpresn_add __ECM(mpresn_add) void mpresn_add (mpres_t, const mpres_t, const mpres_t, mpmod_t); #define mpresn_sub __ECM(mpresn_sub) void mpresn_sub (mpres_t, const mpres_t, const mpres_t, mpmod_t); #define mpresn_mul_1 __ECM(mpresn_mul_ui) void mpresn_mul_1 (mpres_t, const mpres_t, const mp_limb_t, mpmod_t); /* mul_lo.c */ #define ecm_mul_lo_n __ECM(ecm_mul_lo_n) void ecm_mul_lo_n (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #define ecm_mul_lo_basecase __ECM(ecm_mul_lo_basecase) void ecm_mul_lo_basecase (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); /* median.c */ #define TMulGen __ECM(TMulGen) int TMulGen (listz_t, unsigned int, listz_t, unsigned int, listz_t, unsigned int, listz_t, mpz_t); #define TMulGen_space __ECM(TMulGen_space) unsigned int TMulGen_space (unsigned int, unsigned int, unsigned int); /* schoen_strass.c */ #define DEFAULT 0 #define MONIC 1 #define NOPAD 2 #define F_mul __ECM(F_mul) unsigned int F_mul (mpz_t *, mpz_t *, mpz_t *, unsigned int, int, unsigned int, mpz_t *); #define F_mul_trans __ECM(F_mul_trans) unsigned int F_mul_trans (mpz_t *, mpz_t *, mpz_t *, unsigned int, unsigned int, unsigned int, mpz_t *); #define F_clear __ECM(F_clear) void F_clear (); /* rho.c */ #define rhoinit __ECM(rhoinit) void rhoinit (int, int); #define ecmprob __ECM(ecmprob) double ecmprob (double, double, double, double, int); double pm1prob (double, double, double, double, int, const mpz_t); /* auxlib.c */ #define mpz_add_si __ECM(mpz_add_si) void mpz_add_si (mpz_t, mpz_t, long); #define mpz_sub_si __ECM(mpz_sub_si) void mpz_sub_si (mpz_t, mpz_t, long); #define mpz_divby3_1op 
__ECM(mpz_divby3_1op) void mpz_divby3_1op (mpz_t); #define double_to_size __ECM(double_to_size) size_t double_to_size (double d); #define cputime __ECM(cputime) long cputime (void); #define realtime __ECM(realtime) long realtime (void); #define elltime __ECM(elltime) long elltime (long, long); #define test_verbose __ECM(test_verbose) int test_verbose (int); #define get_verbose __ECM(get_verbose) int get_verbose (void); #define set_verbose __ECM(set_verbose) void set_verbose (int); #define inc_verbose __ECM(inc_verbose) int inc_verbose (void); #define outputf __ECM(outputf) int outputf (int, char *, ...); #define writechkfile __ECM(writechkfile) void writechkfile (char *, int, double, mpmod_t, mpres_t, mpres_t, mpres_t); /* auxarith.c */ #define gcd __ECM(gcd) unsigned long gcd (unsigned long, unsigned long); #define eulerphi __ECM(eulerphi) unsigned long eulerphi (unsigned long); #define ceil_log2 __ECM(ceil_log2) unsigned int ceil_log2 (unsigned long); #define is_prime __ECM(is_prime) int is_prime (const unsigned long); #define next_prime __ECM(next_prime) unsigned long next_prime (const unsigned long); #define find_factor __ECM(find_factor) unsigned long find_factor (const unsigned long); /* random.c */ #define pp1_random_seed __ECM(pp1_random_seed) void pp1_random_seed (mpz_t, mpz_t, gmp_randstate_t); #define pm1_random_seed __ECM(pm1_random_seed) void pm1_random_seed (mpz_t, mpz_t, gmp_randstate_t); #define get_random_ul __ECM(get_random_ul) unsigned long get_random_ul (void); /* Fgw.c */ #ifdef HAVE_GWNUM int gw_ecm_stage1 (mpz_t, curve *, mpmod_t, double, double *, mpz_t, double, unsigned long, unsigned long, signed long); #endif /* mul_fft.h */ #define mpn_mul_fft __ECM(mpn_mul_fft) int mpn_mul_fft (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, int); #define mpn_mul_fft_full __ECM(mpn_mul_fft_full) void mpn_mul_fft_full (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t); #define mpn_fft_best_k __ECM(mpn_fft_best_k) int mpn_fft_best_k 
(mp_size_t, int); #define mpn_fft_next_size __ECM(mpn_fft_next_size) mp_size_t mpn_fft_next_size (mp_size_t, int); /* batch.c */ #define compute_s __ECM(compute_s ) void compute_s (mpz_t, unsigned long); #define write_s_in_file __ECM(write_s_in_file) int write_s_in_file (char *, mpz_t); #define read_s_from_file __ECM(read_s_from_file) void read_s_from_file (mpz_t, char *); #define ecm_stage1_batch __ECM(ecm_stage1_batch) int ecm_stage1_batch (mpz_t, mpres_t, mpres_t, mpmod_t, double, double *, int, mpz_t); /* ellparam_batch.c */ #define get_curve_from_ell_parametrization \ __ECM(get_curve_from_ell_parametrization ) int get_curve_from_ell_parametrization (mpz_t, mpres_t, mpz_t, mpmod_t);

/* sets_long.c */

/* A set of long ints: a cardinality followed by that many elements.
   elem[] is declared with one entry but is used as a variable-length
   trailing array, see set_sizeof() and sets_nextset() below. */
typedef struct {
  unsigned long card;
  long elem[1];
} set_long_t;

/* A set of sets of long ints: nr variable-sized sets stored back to back */
typedef struct {
  unsigned long nr;
  set_long_t sets[1];
} sets_long_t;

#define quicksort_long __ECM(quicksort_long)
void          quicksort_long (long *, unsigned long);
#define sets_print __ECM(sets_print)
void          sets_print (const int, sets_long_t *);
#define sets_max __ECM(sets_max)
void          sets_max (mpz_t, const unsigned long);
#define sets_sumset __ECM(sets_sumset)
void          sets_sumset (set_long_t *, const sets_long_t *);
#define sets_sumset_minmax __ECM(sets_sumset_minmax)
void          sets_sumset_minmax (mpz_t, const sets_long_t *, const int);
#define sets_extract __ECM(sets_extract)
void          sets_extract (sets_long_t *, size_t *, sets_long_t *,
                            const unsigned long);
#define sets_get_factored_sorted __ECM(sets_get_factored_sorted)
sets_long_t * sets_get_factored_sorted (const unsigned long);

/* Return the size in bytes of a set of cardinality c: one unsigned long
   for the "card" field plus c longs for the elements.  The operand types
   mirror the set_long_t members so that this expression and the stride
   used by sets_nextset() cannot drift apart. */
#define set_sizeof __ECM(set_sizeof)
ATTRIBUTE_UNUSED
static size_t
set_sizeof (const unsigned long c)
{
  return sizeof (unsigned long) + (size_t) c * sizeof (long);
}

/* Return pointer to the next set in "*sets", i.e. the address obtained by
   skipping the set_sizeof (sets->card) bytes occupied by the current set.
   No bounds check is done; the caller must know that another set follows. */
ATTRIBUTE_UNUSED
static set_long_t *
sets_nextset (const set_long_t *sets)
{
  return (set_long_t *) ((char *) sets + set_sizeof (sets->card));
}

#if defined (__cplusplus)
}
#endif

/* a <- b * c where a and b are mpz, c is a double, and t an auxiliary mpz.
   The macro expands to a single statement (do { } while (0)), so it must be
   followed by a semicolon like a function call and is safe inside an
   unbraced if/else.  The argument c is parenthesised before the cast so
   that an expression argument is converted as a whole; note that c may be
   evaluated more than once. */
/* Not sure how the preprocessor handles shifts by more than the integer
   width on 32 bit machines, so do the shift by 53 in two pieces */
#if (((ULONG_MAX >> 27) >> 26) >= 1)
/* ULONG_MAX >= 2^53: c is converted to unsigned long directly, t is unused */
#define mpz_mul_d(a, b, c, t)                          \
  do {                                                 \
    mpz_mul_ui (a, b, (unsigned long int) (c));        \
  } while (0)
#else
/* Narrow unsigned long: go through the auxiliary mpz t when c is too large */
#define mpz_mul_d(a, b, c, t)                          \
  do {                                                 \
    if ((c) < (double) ULONG_MAX)                      \
      mpz_mul_ui (a, b, (unsigned long int) (c));      \
    else                                               \
      {                                                \
        mpz_set_d (t, c);                              \
        mpz_mul (a, b, t);                             \
      }                                                \
  } while (0)
#endif

#endif /* _ECM_IMPL_H */
ecm-6.4.4/mul_lo.c0000644023561000001540000000432412106741273010654 00000000000000/* Low-half short product (quadratic and Mulders' algorithms). Copyright 2003, 2005, 2006 Paul Zimmermann, Alexander Kruppa, Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "ecm-impl.h" /* puts in {rp, n} the low part of {np, n} times {mp, n}, i.e.
equivalent to: mp_ptr tp; TMP_DECL(marker); TMP_MARK(marker); tp = TMP_ALLOC_LIMBS (2 * n); mpn_mul_n (tp, np, mp, n); MPN_COPY (rp, tp, n); TMP_FREE(marker); */ void ecm_mul_lo_basecase (mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n) { mpn_mul_1 (rp, np, n, mp[0]); for (; --n;) mpn_addmul_1 (++rp, np, n, (++mp)[0]); } #ifdef MPN_MUL_LO_THRESHOLD_TABLE size_t mpn_mul_lo_threshold[MPN_MUL_LO_THRESHOLD] = MPN_MUL_LO_THRESHOLD_TABLE; #else size_t mpn_mul_lo_threshold[MPN_MUL_LO_THRESHOLD]; #endif void ecm_mul_lo_n (mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n) { mp_size_t k; if (n < MPN_MUL_LO_THRESHOLD) { switch (k = mpn_mul_lo_threshold[n]) { case 0: { mpn_mul_n (rp, np, mp, n); return; } case 1: { ecm_mul_lo_basecase (rp, np, mp, n); return; } /* else go through */ } } else k = (mp_size_t) (0.75 * (double) n); mpn_mul_n (rp, np, mp, k); rp += k; n -= k; ecm_mul_lo_n (rp + n, np + k, mp, n); mpn_add_n (rp, rp, rp + n, n); ecm_mul_lo_n (rp + n, np, mp + k, n); mpn_add_n (rp, rp, rp + n, n); } ecm-6.4.4/powerpc64/0000755023561000001540000000000012113421641011117 500000000000000ecm-6.4.4/powerpc64/powerpc-defs.m40000644023561000001540000000260412106741272013710 00000000000000divert(-1) dnl m4 macros for PowerPC assembler (32 and 64). dnl Inspired from GMP 4.1.4 dnl Copyright 2000 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 2.1 of the dnl License, or (at your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. 
dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with the GNU MP Library; see the file COPYING.LIB. If dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - dnl Suite 330, Boston, MA 02111-1307, USA. dnl Usage: r0 ... r31, cr0 ... cr7 dnl dnl Registers names, either left as "r0" etc or mapped to plain 0 etc, dnl according to the result of GMP_ASM_POWERPC_REGISTERS. define(r0,0) define(r1,1) define(r3,3) define(r4,4) define(r5,5) define(r6,6) define(r7,7) define(r8,8) define(r9,9) define(r10,10) define(r11,11) define(r12,12) define(r13,13) define(r14,14) define(r15,15) define(r16,16) divert ecm-6.4.4/powerpc64/mulredc3.asm0000644023561000001540000002271712113421641013270 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. 
dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc3(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc3 GLOBL .GSYM_PREFIX`'mulredc3 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc3: .quad .GSYM_PREFIX`'mulredc3, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc3, 24 C Implements multiplication and REDC for two input numbers of 3 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. 
For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 3] array, having 3+1 8-byte words C The tmp array needs 3+1 entries, but tmp[3] is stored in C r15, so only 3 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'3,`@function') .GSYM_PREFIX`'mulredc3: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 24 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 8(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 16(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 2 C outer loop count mtctr r9 1: C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 16(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc3, .-.GSYM_PREFIX`'mulredc3 ecm-6.4.4/powerpc64/mulredc.h0000644023561000001540000000462512106741272012661 00000000000000#ifndef __ASM_REDC_H__ #define __ASM_REDC_H__ #include /* Signals that we have assembly code for variable size redc */ #define HAVE_ASM_REDC3 extern void ecm_redc3(mp_limb_t *, const mp_limb_t *, mp_size_t, mp_limb_t); /* WARNING: the size-1 version doesn't take pointers in input */ extern mp_limb_t mulredc1(mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); extern mp_limb_t mulredc2(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc3(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc4(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc5(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc6(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc7(mp_limb_t *, const mp_limb_t 
*, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc8(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc9(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc10(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc11(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc12(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc13(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc14(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc15(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc16(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc17(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc18(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc19(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); extern mp_limb_t mulredc20(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, const mp_limb_t *, mp_limb_t); #endif ecm-6.4.4/powerpc64/mulredc14.asm0000644023561000001540000006216512113421641013353 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. 
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc14 GLOBL .GSYM_PREFIX`'mulredc14 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc14: .quad .GSYM_PREFIX`'mulredc14, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc14, 24 C Implements multiplication and REDC for two input numbers of 14 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 14] array, having 14+1 8-byte words C The tmp array needs 14+1 entries, but tmp[14] is stored in C r15, so only 14 entries are used in the stack. 
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'14,`@function') .GSYM_PREFIX`'mulredc14: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 112 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 13. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 96(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 104(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 13 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12 ld r14, 104(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY 
mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 13. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 96(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 104(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc14, .-.GSYM_PREFIX`'mulredc14 ecm-6.4.4/powerpc64/mulredc5.asm0000644023561000001540000003045512113421641013270 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. 
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc5(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry (tmp[len], 0 or 1) returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc5 GLOBL .GSYM_PREFIX`'mulredc5 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc5: .quad .GSYM_PREFIX`'mulredc5, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc5, 24 C Implements multiplication and REDC for two input numbers of 5 64-bit words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; /* first pass: i = 0 */ C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0. C Assume this is true for index i-1. Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr), r16 = constant 0 C C local variables: tmp[0 ... 5] array, having 5+1 8-byte words C The tmp array needs 5+1 entries, but tmp[5] is stored in C r15, so only 5 entries (40 bytes) are used in the stack.
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'5,`@function') .GSYM_PREFIX`'mulredc5: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[0] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 40 C set tmp stack space (5 limbs * 8 bytes) mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 24(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 32(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 4 C outer loop count: len - 1 passes for i = 1 ... 4 mtctr r9 1: C Pass for j = 0.
We need to fetch x[i], tmp[0] and tmp[1] from memory C and compute the new u ldu r12, 8(r4) C x[i] (ldu advances r4 to the next limb) ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1]
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 32(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z (each ldu also steps r1 through the tmp area) ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp[len] ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc5, .-.GSYM_PREFIX`'mulredc5 ecm-6.4.4/powerpc64/Makefile.dev0000755023561000001540000000176712106741272013301 00000000000000.PHONY: all all: test_mulredc bench CFLAGS:=-m64 -mcpu=970 -O3 ALLMULRED:= mulredc1.o mulredc2.o mulredc3.o mulredc4.o mulredc5.o\ mulredc6.o mulredc7.o mulredc8.o mulredc9.o mulredc10.o\ mulredc11.o mulredc12.o mulredc13.o mulredc14.o\ mulredc15.o mulredc16.o mulredc17.o mulredc18.o\ mulredc19.o mulredc20.o redc.s:
redc.asm m4 -I../ redc.asm > redc.s redc.o: redc.s gcc -c $(CFLAGS) redc.s -o redc.o mulredc%.o: mulredc%.asm m4 $< > tmp-mulred.s gcc -c $(CFLAGS) tmp-mulred.s -o $@ rm tmp-mulred.s mulredc1.asm: ./mulredc_1_2.m4 m4 -DLENGTH=1 $< > $@ mulredc2.asm: ./mulredc_1_2.m4 m4 -DLENGTH=2 $< > $@ mulredc%.asm: ./mulredc.m4 m4 -DLENGTH=$* $< > $@ test_mulredc: test_mulredc.c redc.o $(ALLMULRED) gcc -o test_mulredc $(CFLAGS) test_mulredc.c $(ALLMULRED) redc.o -lgmp bench: bench.c redc.o $(ALLMULRED) gcc -o bench $(CFLAGS) bench.c $(ALLMULRED) redc.o -lgmp clean: rm redc.s *.o mulredc[0-9]*.s mulredc[0-9]*.asm test_mulredc ecm-6.4.4/powerpc64/generate_all0000755023561000001540000000030012106741272013407 00000000000000#!/bin/sh for i in 1 2; do m4 -DLENGTH=$i mulredc_1_2.m4 > mulredc$i.asm done for i in 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do m4 -DLENGTH=$i mulredc.m4 > mulredc$i.asm done ecm-6.4.4/powerpc64/mulredc6.asm0000644023561000001540000003333412113421641013270 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. 
dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc6(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc6 GLOBL .GSYM_PREFIX`'mulredc6 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc6: .quad .GSYM_PREFIX`'mulredc6, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc6, 24 C Implements multiplication and REDC for two input numbers of 6 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. 
For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 6] array, having 6+1 8-byte words C The tmp array needs 6+1 entries, but tmp[6] is stored in C r15, so only 6 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'6,`@function') .GSYM_PREFIX`'mulredc6: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 48 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 32(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 40(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 5 C outer loop count mtctr r9 1: C Pass for j = 0. We need to fetch x[i], tmp[0] and tmp[1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add
low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 40(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc6, .-.GSYM_PREFIX`'mulredc6 ecm-6.4.4/powerpc64/mulredc2.asm0000644023561000001540000001107612113421641013263 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. 
If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc2(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc2 GLOBL .GSYM_PREFIX`'mulredc2 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc2: .quad .GSYM_PREFIX`'mulredc2, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc2, 24 TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'2,`@function') .GSYM_PREFIX`'mulredc2: ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result zero mulld r8, r0, r12 C x[0]*y[1] low half adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[0]*y[1] high half ld r0, 8(r6) C m[1] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[1] low adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulhdu r9, r0, r11 C U*m[1] high ldu r12, 8(r4) C x[1] ld r0, 0(r5) C y[0] addc r13, r8, r13 C add T0 and low word mulld r8, r0, r12 C x[1]*y[0] low half adde r14, r9, r14 C add high word with carry to T1 addze r15, r16 C put carry in r15 (tmp[len] <= 1) mulhdu r9, r0, r12 C x[1]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending mulld r8, r0, r12 C x[1]*y[1] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[1]*y[1] high half ld r0, 8(r6) C m[1] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[1] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[1] high addc r8, r8, r13 C add T0 and low word adde r9, r9, r14 C T1, carry pending std r8, 0(r3) C copy result to z stdu r9, 8(r3) addze r3, r10 C return tmp(len) ld r16, 0(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc2, .-.GSYM_PREFIX`'mulredc2 ecm-6.4.4/powerpc64/mulredc.m40000755023561000001540000002543212110743510012744 00000000000000`dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ******************************************************************************' dnl Use `C' to remove comments in .asm -> .s conversion. dnl Copied from GMP 4.2. `define(C, ` dnl')' C mp_limb_t mulredc`'LENGTH`'(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 divert(-1) dnl forloop(i, from, to, stmt) define(`forloop', `pushdef(`$1', `$2')_forloop(`$1', `$2', `$3', `$4')popdef(`$1')') define(`_forloop', `$4`'ifelse($1, `$3', , `define(`$1', incr($1))_forloop(`$1', `$2', `$3', `$4')')') divert `include(`config.m4')' GLOBL GSYM_PREFIX``''mulredc`'LENGTH GLOBL .GSYM_PREFIX``''mulredc`'LENGTH .section ".opd", "aw" .align 3 GSYM_PREFIX``''mulredc`'LENGTH: .quad .GSYM_PREFIX``''mulredc`'LENGTH, .TOC.@tocbase, 0 .size GSYM_PREFIX``''mulredc`'LENGTH, 24 C Implements multiplication and REDC for two input numbers of LENGTH words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; 
/* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... LENGTH] array, having LENGTH+1 8-byte words C The tmp array needs LENGTH+1 entries, but tmp[LENGTH] is stored in C r15, so only LENGTH entries are used in the stack. 
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX``''mulredc``''LENGTH,``@function'') .GSYM_PREFIX``''mulredc`'LENGTH: define(`S', `eval(8 * LENGTH)')dnl C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, S C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 forloop(`UNROLL', 1, eval(LENGTH - 2), `dnl define(`J', `eval(8 * UNROLL)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl C Pass for j = UNROLL mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, J`'(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, J8`'(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, JM8`'(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 ')dnl end forloop C Pass for j = eval(LENGTH - 1). Don't fetch new data from y[j+1].
define(`J', `eval(8*LENGTH - 8)')dnl define(`JM8', `eval(J - 8)')dnl mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, J`'(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, JM8`'(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, J`'(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### define(`LM1', `eval(LENGTH - 1)')dnl li r9, LM1 C outer loop count mtctr r9 1: C Pass for j = 0. We need to fetch x[i], tmp[0] and tmp[1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 forloop(`UNROLL', 1, eval(LENGTH - 2), `dnl define(`J', `eval(8 * UNROLL)')dnl define(`J8', `eval(J + 8)')dnl define(`JM8', `eval(J - 8)')dnl C Pass for j = UNROLL ld r14, J8`'(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, J`'(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry
to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, J8`'(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, JM8`'(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 ')dnl end forloop C Pass for j = eval(LENGTH - 1). Don't fetch new data from y[j+1]. define(`J', `eval(8*LENGTH - 8)')dnl define(`JM8', `eval(J - 8)')dnl mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, J`'(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, JM8`'(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, J`'(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z dnl ==== THIS LOOP WILL NOT WORK FOR LENGTH <= 1 ==== forloop(`UNROLL', 0, eval(LENGTH / 2 - 1), `dnl define(`J', `eval(UNROLL)')dnl ifelse(J, `0', dnl ` ld r8, 0(r1)', dnl ` ldu r8, 8(r1)') ldu r9, 8(r1) ifelse(J, `0', dnl ` std r8, 0(r3)', dnl ` stdu r8, 8(r3)') stdu r9, 8(r3) ')dnl ifelse(eval(LENGTH % 2), 1, `dnl ldu r8, 8(r1) stdu r8, 8(r3) ')dnl mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX``''mulredc`'LENGTH, .-.GSYM_PREFIX``''mulredc`'LENGTH ecm-6.4.4/powerpc64/mulredc20.asm0000644023561000001540000010330512113421641013340 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. 
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc20(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc20 GLOBL .GSYM_PREFIX`'mulredc20 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc20: .quad .GSYM_PREFIX`'mulredc20, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc20, 24 C Implements multiplication and REDC for two input numbers of 20 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 20] array, having 20+1 8-byte words C The tmp array needs 20+1 entries, but tmp[20] is stored in C r15, so only 20 entries are used in the stack. 
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'20,`@function') .GSYM_PREFIX`'mulredc20: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 160 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 13 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 14 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 15 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 128(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 16 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 136(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 120(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 17 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 136(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 144(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 128(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 18 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 144(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 152(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 136(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 19. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 152(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 144(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 152(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 19 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12 ld r14, 104(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY 
mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 13 ld r14, 112(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 14 ld r14, 120(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 15 ld r14, 128(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld 
r0, 128(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 16 ld r14, 136(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 136(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 120(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 17 ld r14, 144(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 136(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 144(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 128(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 18 ld r14, 152(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 144(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 152(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 
136(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 19. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 152(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 144(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 152(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc20, .-.GSYM_PREFIX`'mulredc20 ecm-6.4.4/powerpc64/mulredc17.asm0000644023561000001540000007263512113421641013361 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. 
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc17(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc17 GLOBL .GSYM_PREFIX`'mulredc17 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc17: .quad .GSYM_PREFIX`'mulredc17, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc17, 24 C Implements multiplication and REDC for two input numbers of 17 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4). C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0. C Assume this is true for index i-1. Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 17] array, having 17+1 8-byte words C The tmp array needs 17+1 entries, but tmp[17] is stored in C r15, so only 17 entries are used on the stack.
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'17,`@function') .GSYM_PREFIX`'mulredc17: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 136 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 13 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 14 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 15 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 128(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 16. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 120(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 128(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 16 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12 ld r14, 104(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY 
mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 13 ld r14, 112(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 14 ld r14, 120(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 15 ld r14, 128(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld 
r0, 128(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 16. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 120(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 128(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc17, .-.GSYM_PREFIX`'mulredc17 ecm-6.4.4/powerpc64/redc.asm0000755023561000001540000003076312110743510012471 00000000000000dnl ******************************************************************************
dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
dnl
dnl   This file is part of the ECM Library.
dnl
dnl   The ECM Library is free software; you can redistribute it and/or modify
dnl   it under the terms of the GNU Lesser General Public License as published by
dnl   the Free Software Foundation; either version 3 of the License, or (at your
dnl   option) any later version.
dnl
dnl   The ECM Library is distributed in the hope that it will be useful, but
dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl   License for more details.
dnl
dnl   You should have received a copy of the GNU Lesser General Public License
dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl   MA 02110-1301, USA.
dnl ******************************************************************************
dnl
dnl void ecm_redc3(mp_limb_t * c, const mp_limb_t * m, size_t n, mp_limb_t m_inv)
dnl
dnl input arguments:
dnl
dnl r3: ptr to c[0], the least significant word of the number to be reduced
dnl     c[0 ... 2*n-1] is of length 2*n words
dnl r4: ptr to m[0], the least significant word of the modulus m of length n
dnl r5: the length n
dnl r6: m_inv = -1/m mod 2^64
dnl
dnl the residue (before adding the word carries) will be in c[n ... 2*n-1].
dnl c[0 ... n-1] will contain the high word carries from each inner loop pass.
dnl These carry words are added by the calling routine to obtain the final
dnl residue.
dnl
dnl Outline for n >= 2: the outer loop runs n times.  In pass i (with r3
dnl pointing at c[i]) it computes u = c[i] * m_inv mod 2^64 and adds u*m to
dnl c[i ... i+n-1].  The low word of that sum is zero and is not stored;
dnl words c[i+1 ... i+n-1] are updated in place; the remaining high word
dnl (m[n-1]*u high + carry) is written to c[i] as that pass's carry word.
dnl The step for m[1] is done in the outer loop head, the following n-2
dnl steps run in a 16-fold unrolled body that is entered through a computed
dnl branch via the link register, and the last store is done after the body.
dnl For n == 1 a short straight-line path produces the single carry word.
dnl
dnl Registers written: r0, r3, r5, r7-r12, cr0, the carry bit, ctr and lr.
dnl r13 and the caller's return address are pushed on the stack on entry
dnl to the n >= 2 path and restored before returning.  No value is returned.
dnl

dnl Use `C' to remove comments in .asm -> .s conversion.
dnl Copied from GMP 4.2.
define(C, `
dnl')

include(`config.m4')

	GLOBL GSYM_PREFIX`'ecm_redc3
	GLOBL .GSYM_PREFIX`'ecm_redc3

C Descriptor for ecm_redc3 in the .opd section: address of the code entry
C point (the dot-prefixed symbol), the TOC base, and a zero word.
	.section ".opd", "aw"
	.align	3
GSYM_PREFIX`'ecm_redc3:
	.quad	.GSYM_PREFIX`'ecm_redc3, .TOC.@tocbase, 0
	.size	GSYM_PREFIX`'ecm_redc3, 24

	TEXT
	.align	5	C 32 byte alignment
	TYPE(.GSYM_PREFIX`'ecm_redc3,`@function')
.GSYM_PREFIX`'ecm_redc3:
	cmpdi	r5, 1			C length = 1?
	bne	1f
C n == 1: one multiply-add, no loop, no stack use.
	ld	r12, 0(r3)		C c[0]
	ld	r0, 0(r4)		C m[0]
	mulld	r7, r6, r12		C u = c[0] * m_inv mod 2^64
	mulld	r11, r0, r7		C m[0]*u low
	mulhdu	r10, r0, r7		C m[0]*u high
	addc	r11, r11, r12		C c[0] + m[0]*u low = 0
	addze	r10, r10		C carry to high half
	std	r10, 0(r3)		C store the "carry" word
	blr
C The five nops below bring the code above to 16 instructions (64 bytes).
C NOTE(review): presumably this keeps the loop labels further down on
C 32-byte boundaries relative to the .align 5 above -- confirm.
	nop
	nop
	nop
	nop
	nop
C n >= 2: LR is overwritten below for the computed branch, so the return
C address is saved first, together with r13 which serves as inner loop count.
1:	mflr	r0			C save return addr
	stdu	r0, -8(r1)		C on the stack
	stdu	r13, -8(r1)		C save r13
dnl
dnl get inner loop count and jump offset
dnl
C Each unrolled step in the ILoop body is 8 instructions = 32 bytes, which
C is why the remainder (n - 2) mod 16 is scaled by 32 to get a byte offset.
	subi	r7, r5, 2		C r7 = n - 2
	andi.	r8, r7, 15		C r8 = (n - 2) mod 16
	sldi	r8, r8, 5		C r8 * 32 = byte offset
	srdi	r7, r7, 4		C int((n - 2)/16)
dnl
dnl compute the address of inner loop end and subtract the offset
dnl
	bl	nxt			C put the address of the next instruction
					C into the link register
nxt:					C
	mflr	r9			C r9 = address of this instruction
	addi	r9, r9, 640		C add offset to v_1 from nxt
					C WARNING: any changes to the code between
					C the labels "nxt" and "v_1" may require
					C recomputation of the offset above.
C 640 = 160 instructions * 4 bytes: 8 from nxt up to OuterLoop, 24 from
C OuterLoop up to ILoop (17 instructions + 7 nops), 16 * 8 in the ILoop body.
	sub	r9, r9, r8		C offset back to desired starting point
	mtlr	r9			C and now we can branch directly to our target
	mtctr	r5			C outer loop count n
	addi	r13, r7, 1		C inner loop counter
	nop
	nop
C ctr is shared by both loops: the outer count is parked in r5 (mfctr) while
C ctr holds the inner count from r13, and is put back before bdnz OuterLoop.
OuterLoop:				C execute n times
dnl compute u, set addr's
	ld	r12, 0(r3)		C c[0]
	mr	r8, r4			C r8 = working copy of m address
	ld	r0, 0(r8)		C m[0]
	mulld	r7, r6, r12		C u = c[0] * m_inv mod 2^64
	mfctr	r5			C save current outer loop count
dnl start inner
	mulld	r11, r0, r7		C m[0]*u low
	mtctr	r13			C inner loop count
	mulhdu	r10, r0, r7		C m[0]*u high
	ldu	r0, 8(r8)		C m[1]
	addc	r11, r11, r12		C m[0]*u low + c[0] (don't bother storing zero)
	mulld	r11, r0, r7		C m[1]*u low
	ldu	r12, 8(r3)		C c[1], update c address
	mr	r9, r3			C r9 = working copy of c addr
	mulhdu	r0, r0, r7		C m[1]*u high
	adde	r11, r10, r11		C m[1]*u low + m[0]*u high + cy
	addze	r10, r0			C m[1]*u high + cy
	blr				C jump to start of the (n-2) mod 16 section
					C (or to v_1, if (n-2) mod 16 = 0)
C LR still holds the target computed before OuterLoop; it is not changed
C inside the loops, so the same entry point is used in every outer pass.
	nop
	nop
	nop
	nop
	nop
	nop
	nop
C Unrolled body: 16 identical steps.  Each step stores the finished word
C c[i-1], loads m[i] and c[i], and leaves the next partial sum in r11 with
C the running high part in r10.
ILoop:
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 15
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 14
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 13
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 12
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 11
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 10
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 9
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 8
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 7
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 6
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 5
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 4
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 3
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 2
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy
dnl
dnl start (n-2) mod 16 = 1
dnl
	ldu	r0, 8(r8)		C m[i]
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	mulld	r11, r0, r7		C m[i]*u low
	ldu	r12, 8(r9)		C c[i]
	mulhdu	r0, r0, r7		C m[i]*u high
	adde	r11, r10, r11		C m[i]*u low + m[i-1]*u high + cy
	addze	r10, r0			C r10 = m[i]*u + cy

v_1:	bdnz	ILoop			C blr above jumps directly to this bdnz instruction
					C when (n-2) mod 16 = 0
dnl finish inner
	addc	r11, r11, r12		C m[i-1]*u low + m[i-2]*u high + c[i-1]
	std	r11, 0(r9)		C store it in c[i-1]
	addze	r10, r10		C result cy = 0 always
	std	r10, -8(r3)		C store the "carry" word
C r3 was advanced by 8 at the top of this pass, so -8(r3) is the word that
C was read as c[0] for this pass (its low sum was zero and never stored).
	mtctr	r5			C restore outer loop count
	bdnz	OuterLoop

C Epilogue: pop the two words pushed on entry to the n >= 2 path.
	ld	r13, 0(r1)		C restore r13
	ld	r0, 8(r1)		C original return address
	addi	r1, r1, 16		C restore stack ptr
	mtlr	r0
	blr

	.size	.GSYM_PREFIX`'ecm_redc3, .-.GSYM_PREFIX`'ecm_redc3
ecm-6.4.4/powerpc64/mulredc18.asm0000644023561000001540000007552512113421641013363 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library.
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc18(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc18 GLOBL .GSYM_PREFIX`'mulredc18 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc18: .quad .GSYM_PREFIX`'mulredc18, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc18, 24 C Implements multiplication and REDC for two input numbers of 18 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 18] array, having 18+1 8-byte words C The tmp array needs 18+1 entries, but tmp[18] is stored in C r15, so only 18 entries are used in the stack. 
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'18,`@function') .GSYM_PREFIX`'mulredc18: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 144 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 13 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 14 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 15 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 128(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 16 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 136(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 120(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 17. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 136(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 128(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 136(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 17 C outer loop count mtctr r9 1: C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C 
add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 
addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add 
T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 
80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12 ld r14, 104(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 13 ld r14, 112(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 14 ld r14, 120(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 
(2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 15 ld r14, 128(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 128(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 16 ld r14, 136(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 136(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 120(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 17. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 136(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 128(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 136(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc18, .-.GSYM_PREFIX`'mulredc18 ecm-6.4.4/powerpc64/mulredc19.asm0000644023561000001540000010041512113421641013347 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. 
dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc19(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc19 GLOBL .GSYM_PREFIX`'mulredc19 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc19: .quad .GSYM_PREFIX`'mulredc19, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc19, 24 C Implements multiplication and REDC for two input numbers of 19 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 
C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 19] array, having 19+1 8-byte words C The tmp array needs 19+1 entries, but tmp[19] is stored in C r15, so only 19 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'19,`@function') .GSYM_PREFIX`'mulredc19: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 152 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 13 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 14 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 15 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 128(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 16 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 136(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 120(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 17 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 136(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 144(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 128(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 18. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 144(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 136(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 144(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 18 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12 ld r14, 104(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY 
mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 13 ld r14, 112(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 14 ld r14, 120(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 15 ld r14, 128(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld 
r0, 128(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 112(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 16 ld r14, 136(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 128(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 136(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 120(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 17 ld r14, 144(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 136(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 144(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 128(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 18. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 144(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 136(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 144(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc19, .-.GSYM_PREFIX`'mulredc19 ecm-6.4.4/powerpc64/mulredc9.asm0000644023561000001540000004375112113421641013277 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. 
dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc9(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc9 GLOBL .GSYM_PREFIX`'mulredc9 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc9: .quad .GSYM_PREFIX`'mulredc9, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc9, 24 C Implements multiplication and REDC for two input numbers of 9 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for 
j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 9] array, having 9+1 8-byte words C The tmp array needs 9+1 entries, but tmp[9] is stored in C r15, so only 9 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'9,`@function') .GSYM_PREFIX`'mulredc9: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 72 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 56(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 64(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 8 C outer loop count mtctr r9 1: C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add 
low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze 
r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 64(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc9, .-.GSYM_PREFIX`'mulredc9 ecm-6.4.4/powerpc64/mulredc13.asm0000644023561000001540000005727512113421641013360 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. 
dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc13(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc13 GLOBL .GSYM_PREFIX`'mulredc13 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc13: .quad .GSYM_PREFIX`'mulredc13, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc13, 24 C Implements multiplication and REDC for two input numbers of 13 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... 
len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1. Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 13] array, having 13+1 8-byte words C The tmp array needs 13+1 entries, but tmp[13] is stored in C r15, so only 13 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'13,`@function') .GSYM_PREFIX`'mulredc13: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 104 C set tmp stack space (13 words of 8 bytes) mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 mod 2^64, only the carry is used adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 88(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 96(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 12 C outer loop count: len - 1 = 12 passes mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[0] and tmp[1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 96(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) C restore r16 ldu r15, 8(r1) C restore r15 ldu r14, 8(r1) C restore r14 ldu r13, 8(r1) C restore r13 addi r1, r1, 8 C stack pointer back to its value on entry blr .size .GSYM_PREFIX`'mulredc13, .-.GSYM_PREFIX`'mulredc13 ecm-6.4.4/powerpc64/mulredc12.asm0000644023561000001540000005441312113421641013346 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. 
dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc12 GLOBL .GSYM_PREFIX`'mulredc12 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc12: .quad .GSYM_PREFIX`'mulredc12, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc12, 24 C Implements multiplication and REDC for two input numbers of 12 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... 
len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 12] array, having 12+1 8-byte words C The tmp array needs 12+1 entries, but tmp[12] is stored in C r15, so only 12 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'12,`@function') .GSYM_PREFIX`'mulredc12: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 96 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 80(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 88(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 11 C outer loop count mtctr r9 1: C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C 
add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 
addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add 
T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 88(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc12, .-.GSYM_PREFIX`'mulredc12 ecm-6.4.4/powerpc64/mulredc16.asm0000644023561000001540000006774512113421641013366 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc16(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc16 GLOBL .GSYM_PREFIX`'mulredc16 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc16: .quad .GSYM_PREFIX`'mulredc16, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc16, 24 C Implements multiplication and REDC for two input numbers of 16 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... 
len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 16] array, having 16+1 8-byte words C The tmp array needs 16+1 entries, but tmp[16] is stored in C r15, so only 16 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'16,`@function') .GSYM_PREFIX`'mulredc16: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 128 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 11 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 12 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 13 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 14 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 15. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 112(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 120(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 15 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[0] and tmp[1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10 ld r14, 88(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 88(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 11 ld r14, 96(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 88(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 96(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 80(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 12 ld r14, 104(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 96(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY 
mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 104(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 88(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 13 ld r14, 112(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 104(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 112(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 96(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 14 ld r14, 120(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 112(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 120(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 104(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 15. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 120(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 112(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 120(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc16, .-.GSYM_PREFIX`'mulredc16 ecm-6.4.4/powerpc64/mulredc1.asm0000644023561000001540000000411712113421641013260 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. 
dnl
dnl The ECM Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the ECM Library; see the file COPYING.LIB. If not, write to
dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl MA 02110-1301, USA.
dnl ******************************************************************************
define(C, ` dnl')
C mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y,
C                    const mp_limb_t m, mp_limb_t inv_m);
C
C Multiplication and REDC for one-word operands:
C   T1:T0 = x*y
C   u     = (T0 * inv_m) mod 2^64
C   T1:T0 + u*m has a zero low word; its next 64 bits are stored in z[0]
C   and the carry out of that word is the return value.
C
C arguments:
C r3 : ptr to result z
C r4 : input x
C r5 : input y
C r6 : modulus m
C r7 = -1/m mod 2^64
C
C final carry returned in r3
include(`config.m4')
	GLOBL GSYM_PREFIX`'mulredc1
	GLOBL .GSYM_PREFIX`'mulredc1
	.section ".opd", "aw"
	.align 3
GSYM_PREFIX`'mulredc1:
	.quad .GSYM_PREFIX`'mulredc1, .TOC.@tocbase, 0
	.size GSYM_PREFIX`'mulredc1, 24
	TEXT
	.align 5	C powerPC 32 byte alignment
	TYPE(.GSYM_PREFIX`'mulredc`'1,`@function')
.GSYM_PREFIX`'mulredc1:
	mulld r8, r4, r5	C x*y low half T0
	mulhdu r9, r4, r5	C x*y high half T1
	mulld r0, r7, r8	C u = t0 * invm
	mulld r10, r0, r6	C u*m low
	mulhdu r11, r0, r6	C u*m high
	addc r8, r8, r10	C x*y + u*m low (= zero)
	adde r9, r9, r11	C result
	std r9, 0(r3)	C store in z
	addze r3, r8	C return carry
			C (r8 is zero here, so r3 = carry out of the adde above)
	blr
	.size .GSYM_PREFIX`'mulredc1, .-.GSYM_PREFIX`'mulredc1
ecm-6.4.4/powerpc64/mulredc_1_2.m40000755023561000001540000001317712110743510013410 00000000000000`dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library.
dnl
dnl The ECM Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published by
dnl the Free Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl The ECM Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the ECM Library; see the file COPYING.LIB. If not, write to
dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl MA 02110-1301, USA.
dnl ******************************************************************************'
dnl Template for the one-word and two-word mulredc sources. The value of
dnl LENGTH picks the header comment and the function body through the
dnl ifelse(eval(LENGTH), ...) tests below; only the values 1 and 2 have a
dnl body here, any other value yields an empty function.
dnl Use `C' to remove comments in .asm -> .s conversion.
dnl Copied from GMP 4.2.
`define(C, ` dnl')'
ifelse(eval(LENGTH),1,
C mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y,
C const mp_limb_t m, mp_limb_t inv_m);
C
C arguments:
C r3 : ptr to result z
C r4 : input x
C r5 : input y
C r6 : modulus m',
`C mp_limb_t mulredc'LENGTH`(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C const mp_limb_t *m, mp_limb_t inv_m);
C
C arguments:
C r3 = ptr to result z least significant limb
C r4 = ptr to input x least significant limb
C r5 = ptr to input y least significant limb
C r6 = ptr to modulus m least significant limb')
C r7 = -1/m mod 2^64
C
C final carry returned in r3
divert(-1)
dnl forloop(i, from, to, stmt)
define(`forloop', `pushdef(`$1', `$2')_forloop(`$1', `$2', `$3', `$4')popdef(`$1')')
define(`_forloop', `$4`'ifelse($1, `$3', , `define(`$1', incr($1))_forloop(`$1', `$2', `$3', `$4')')')
divert
`include(`config.m4')'
	GLOBL GSYM_PREFIX``''mulredc`'LENGTH
	GLOBL .GSYM_PREFIX``''mulredc`'LENGTH
	.section ".opd", "aw"
	.align 3
GSYM_PREFIX``''mulredc`'LENGTH:
	.quad .GSYM_PREFIX``''mulredc`'LENGTH, .TOC.@tocbase, 0
	.size GSYM_PREFIX``''mulredc`'LENGTH, 24
	TEXT
	.align 5	C powerPC 32 byte alignment
	TYPE(.GSYM_PREFIX``''mulredc``''LENGTH,``@function'')
.GSYM_PREFIX``''mulredc`'LENGTH:
ifelse(eval(LENGTH),1, `
	mulld r8, r4, r5	C x*y low half T0
	mulhdu r9, r4, r5	C x*y high half T1
	mulld r0, r7, r8	C u = t0 * invm
	mulld r10, r0, r6	C u*m low
	mulhdu r11, r0, r6	C u*m high
	addc r8, r8, r10	C x*y + u*m low (= zero)
	adde r9, r9, r11	C result
	std r9, 0(r3)	C store in z
	addze r3, r8	C return carry
	blr',
eval(LENGTH),2, `
C Two-word variant. Between the two passes tmp[0], tmp[1] and tmp[len] stay
C in r13, r14 and r15; the stack only holds the four saved registers.
	ld r12, 0(r4)	C XI = x[0]
	ld r0, 0(r5)	C y[0]
	stdu r13, -8(r1)	C save r13
	mulld r8, r0, r12	C x[0]*y[0] low half
	stdu r14, -8(r1)	C save r14
	mulhdu r9, r0, r12	C x[0]*y[0] high half
	ld r0, 0(r6)	C m[0]
	mulld r11, r7, r8	C U = T0*invm mod 2^64
	stdu r15, -8(r1)	C save r15
	mulld r13, r0, r11	C T0 = U*m[0] low
	stdu r16, -8(r1)	C save r16
	li r16, 0	C set r16 to zero for carry propagation
	mulhdu r14, r0, r11	C T1 = U*m[0] high
	ld r0, 8(r5)	C y[1]
	addc r8, r8, r13	C result zero
	mulld r8, r0, r12	C x[0]*y[1] low half
	adde r13, r9, r14	C T0 = initial tmp(0)
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[0]*y[1] high half
	ld r0, 8(r6)	C m[1]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[1] low
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulhdu r9, r0, r11	C U*m[1] high
	ldu r12, 8(r4)	C x[1]
	ld r0, 0(r5)	C y[0]
	addc r13, r8, r13	C add T0 and low word
	mulld r8, r0, r12	C x[1]*y[0] low half
	adde r14, r9, r14	C add high word with carry to T1
	addze r15, r16	C put carry in r15 (tmp[len] <= 1)
	mulhdu r9, r0, r12	C x[1]*y[0] high half
	addc r13, r8, r13	C T0
	ld r0, 0(r6)	C m[0]
	mulld r11, r7, r13	C U = T0*invm mod 2^64
	adde r14, r9, r14	C T1
	mulld r8, r0, r11	C U*m[0] low
	addze r10, r16	C CY
	mulhdu r9, r0, r11	C U*m[0] high
	ld r0, 8(r5)	C y[1]
	addc r8, r8, r13	C result = 0
	adde r13, r9, r14	C T0, carry pending
	mulld r8, r0, r12	C x[1]*y[1] low half
	adde r14, r15, r10	C T1 = tmp[len] + CY + pending carry
	C since tmp[len] <= 1, T1 <= 3 and carry is zero
	mulhdu r9, r0, r12	C x[1]*y[1] high half
	ld r0, 8(r6)	C m[1]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[1] low
	adde r14, r9, r14	C add high to T1
	addze r10, r16	C CY
	mulhdu r9, r0, r11	C U*m[1] high
	addc r8, r8, r13	C add T0 and low word
	adde r9, r9, r14	C T1, carry pending
	std r8, 0(r3)	C copy result to z
	stdu r9, 8(r3)
	addze r3, r10	C return tmp(len)
	ld r16, 0(r1)
	ldu r15, 8(r1)
	ldu r14, 8(r1)
	ldu r13, 8(r1)
	addi r1, r1, 8
	blr')
	.size .GSYM_PREFIX``''mulredc`'LENGTH, .-.GSYM_PREFIX``''mulredc`'LENGTH
ecm-6.4.4/powerpc64/mulredc15.asm0000644023561000001540000006505512113421641013355 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version.
dnl
dnl The ECM Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the ECM Library; see the file COPYING.LIB. If not, write to
dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl MA 02110-1301, USA.
dnl ******************************************************************************
define(C, ` dnl')
C mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C                     const mp_limb_t *m, mp_limb_t inv_m);
C
C arguments:
C r3 = ptr to result z least significant limb
C r4 = ptr to input x least significant limb
C r5 = ptr to input y least significant limb
C r6 = ptr to modulus m least significant limb
C r7 = -1/m mod 2^64
C
C final carry returned in r3
include(`config.m4')
	GLOBL GSYM_PREFIX`'mulredc15
	GLOBL .GSYM_PREFIX`'mulredc15
	.section ".opd", "aw"
	.align 3
GSYM_PREFIX`'mulredc15:
	.quad .GSYM_PREFIX`'mulredc15, .TOC.@tocbase, 0
	.size GSYM_PREFIX`'mulredc15, 24
C Implements multiplication and REDC for two input numbers of 15 words
C The algorithm:
C (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
C
C /* first pass, i = 0 */
C T1:T0 = x[i]*y[0] ;
C u = (T0*invm) % 2^64 ;
C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
C for (j = 1; j < len; j++)
C   {
C     cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
C     /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
C     tmp[j-1] = T0;
C   }
C tmp[len-1] = T1 ;
C tmp[len] = cy ; /* cy <= 1 (see note 2) */
C for (i = 1; i < len; i++)
C   {
C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
C     u = (T0*invm) % 2^64 ;
C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
C     for (j = 1; j < len; j++)
C       {
C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
C            for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */
C         tmp[j-1] = T0;
C       }
C     tmp[len-1] = T1 ;
C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
C   }
C z[0 ... len-1] = tmp[0 ... len-1] ;
C return (tmp[len]) ;
C
C notes:
C
C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2,
C    so cy:T1 <= 2*2^64 - 4.
C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
C    <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
C    so cy:T1 <= 2*2^64 - 3. For j > 1,
C    x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
C    so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
C    so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
C    <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
C    <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
C    so cy:T1 <= 3*2^64 - 3. For j > 1,
C    x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
C    so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
C    For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0.
C    Assume this is true for index i-1, then
C    x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
C    <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
C    <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
C    so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
C
C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11
C                YP = r5, MP = r6, TP = r1 (stack ptr)
C
C local variables: tmp[0 ... 15] array, having 15+1 8-byte words
C The tmp array needs 15+1 entries, but tmp[15] is stored in
C r15, so only 15 entries are used in the stack.
	TEXT
	.align 5	C powerPC 32 byte alignment
	TYPE(.GSYM_PREFIX`'mulredc`'15,`@function')
.GSYM_PREFIX`'mulredc15:
C ########################################################################
C # i = 0 pass
C #########################################################################
C Pass for j = 0. We need to fetch x[i] from memory and compute the new u
	ld r12, 0(r4)	C XI = x[0]
	ld r0, 0(r5)	C y[0]
	stdu r13, -8(r1)	C save r13
	mulld r8, r0, r12	C x[0]*y[0] low half
	stdu r14, -8(r1)	C save r14
	mulhdu r9, r0, r12	C x[0]*y[0] high half
	ld r0, 0(r6)	C m[0]
	mulld r11, r7, r8	C U = T0*invm mod 2^64
	stdu r15, -8(r1)	C save r15
	mulld r13, r0, r11	C T0 = U*m[0] low
	stdu r16, -8(r1)	C save r16
	li r16, 0	C set r16 to zero for carry propagation
	subi r1, r1, 120	C set tmp stack space
	mulhdu r14, r0, r11	C T1 = U*m[0] high
	ld r0, 8(r5)	C y[1]
	addc r8, r8, r13	C result zero (only the carry out is used)
	adde r13, r9, r14	C T0 = initial tmp(0)
	addze r10, r16	C carry to CY
	C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	C CY:T1 <= 2*2^64 - 4
C Pass for j = 1
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 8(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 16(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 0(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 2
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 16(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 24(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 8(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 3
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 24(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 32(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 16(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 4
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 32(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 40(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 24(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 5
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 40(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 48(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 32(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 6
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 48(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 56(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 40(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 7
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 56(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 64(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 48(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 8
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 64(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 72(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 56(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 9
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 72(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 80(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 64(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 10
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 80(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 88(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 72(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 11
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 88(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 96(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 80(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 12
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 96(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 104(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 88(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 13
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 104(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 112(r5)	C y[j+1]
	adde r13, r9, r14	C add high word with carry to T1
	addze r10, r16	C carry to CY
	std r8, 96(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
C Pass for j = 14. Don't fetch new data from y[j+1].
	mulld r8, r0, r12	C x[i]*y[j] low half
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 112(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	adde r14, r9, r10	C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
	mulld r8, r0, r11	C U*m[j] low
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	adde r13, r9, r14	C add high word with carry to T1
	std r8, 104(r1)	C store tmp[len-2]
	addze r15, r16	C put carry in r15 (tmp[len] <= 1)
	std r13, 112(r1)	C store tmp[len-1]
C #########################################################################
C # i > 0 passes
C #########################################################################
	li r9, 14	C outer loop count (one iteration per remaining x[i])
	mtctr r9
1:
C Pass for j = 0. We need to fetch x[i], tmp[0] and tmp[1] from memory
C and compute the new u
	ldu r12, 8(r4)	C x[i]
	ld r0, 0(r5)	C y[0]
	ld r13, 0(r1)	C tmp[0]
	mulld r8, r0, r12	C x[i]*y[0] low half
	ld r14, 8(r1)	C tmp[1]
	mulhdu r9, r0, r12	C x[i]*y[0] high half
	addc r13, r8, r13	C T0
	ld r0, 0(r6)	C m[0]
	mulld r11, r7, r13	C U = T0*invm mod 2^64
	adde r14, r9, r14	C T1
	mulld r8, r0, r11	C U*m[0] low
	addze r10, r16	C CY
	mulhdu r9, r0, r11	C U*m[0] high
	ld r0, 8(r5)	C y[1]
	addc r8, r8, r13	C result = 0
	adde r13, r9, r14	C T0, carry pending
	C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
	C so cy:T1 <= 3*2^64 - 4
C Pass for j = 1
	ld r14, 16(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 8(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 16(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 0(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 2
	ld r14, 24(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 16(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 24(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 8(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 3
	ld r14, 32(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 24(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 32(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 16(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 4
	ld r14, 40(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 32(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 40(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 24(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 5
	ld r14, 48(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 40(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 48(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 32(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 6
	ld r14, 56(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 48(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 56(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 40(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 7
	ld r14, 64(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 56(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 64(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 48(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 8
	ld r14, 72(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 64(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 72(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 56(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 9
	ld r14, 80(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 72(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 80(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 64(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 10
	ld r14, 88(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 80(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 88(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 72(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 11
	ld r14, 96(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 88(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 96(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 80(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 12
	ld r14, 104(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 96(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 104(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 88(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 13
	ld r14, 112(r1)	C tmp[j+1]
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r14, r10	C tmp[j+1] + CY + pending carry
	addze r10, r16	C carry to CY
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 104(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r10	C add carry to CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	ld r0, 112(r5)	C y[j+1]
	adde r13, r9, r14	C T1, carry pending
	std r8, 96(r1)	C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
C Pass for j = 14. Don't fetch new data from y[j+1].
	mulld r8, r0, r12	C x[i]*y[j] low half
	adde r14, r15, r10	C T1 = tmp[len] + CY + pending carry
	C since tmp[len] <= 1, T1 <= 3 and carry is zero
	mulhdu r9, r0, r12	C x[i]*y[j] high half
	ld r0, 112(r6)	C m[j]
	addc r13, r8, r13	C add low word to T0
	mulld r8, r0, r11	C U*m[j] low
	adde r14, r9, r14	C add high to T1
	addze r10, r16	C CY
	mulhdu r9, r0, r11	C U*m[j] high
	addc r8, r8, r13	C add T0 and low word
	adde r13, r9, r14	C T1, carry pending
	std r8, 104(r1)	C store tmp[len-2]
	addze r15, r10	C store tmp[len] <= 1
	std r13, 112(r1)	C store tmp[len-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
	C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)
	bdnz 1b
C Copy result from tmp memory to z
C (15 words; every ldu/stdu advances r1 resp. r3 by 8, so after the copy
C r1 points at tmp[14])
	ld r8, 0(r1)
	ldu r9, 8(r1)
	std r8, 0(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	ldu r9, 8(r1)
	stdu r8, 8(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	ldu r9, 8(r1)
	stdu r8, 8(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	ldu r9, 8(r1)
	stdu r8, 8(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	ldu r9, 8(r1)
	stdu r8, 8(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	ldu r9, 8(r1)
	stdu r8, 8(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	ldu r9, 8(r1)
	stdu r8, 8(r3)
	stdu r9, 8(r3)
	ldu r8, 8(r1)
	stdu r8, 8(r3)
	mr r3, r15	C return tmp(len)
C Restore r16 ... r13 in reverse order of the stdu saves at function entry,
C then release the last save slot
	ldu r16, 8(r1)
	ldu r15, 8(r1)
	ldu r14, 8(r1)
	ldu r13, 8(r1)
	addi r1, r1, 8
	blr
	.size .GSYM_PREFIX`'mulredc15, .-.GSYM_PREFIX`'mulredc15
ecm-6.4.4/powerpc64/mulredc10.asm0000644023561000001540000004665012113421641013350 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc10 GLOBL .GSYM_PREFIX`'mulredc10 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc10: .quad .GSYM_PREFIX`'mulredc10, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc10, 24 C Implements multiplication and REDC for two input numbers of 10 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc10(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc10 GLOBL .GSYM_PREFIX`'mulredc10 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc10: .quad .GSYM_PREFIX`'mulredc10, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc10, 24 C Implements multiplication and REDC for two input numbers of 10 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 
C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0. C Assume this is true for index i-1. Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 10] array, having 10+1 8-byte words C The tmp array needs 10+1 entries, but tmp[10] is stored in C r15, so only 10 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'10,`@function') .GSYM_PREFIX`'mulredc10: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0.
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 80 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 64(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 72(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 9 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 72(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc10, .-.GSYM_PREFIX`'mulredc10 ecm-6.4.4/powerpc64/Makefile.am0000755023561000001540000000445512106741272013115 00000000000000MULREDC = mulredc1.asm mulredc2.asm mulredc3.asm mulredc4.asm mulredc5.asm \ mulredc6.asm mulredc7.asm mulredc8.asm mulredc9.asm mulredc10.asm \ mulredc11.asm mulredc12.asm mulredc13.asm mulredc14.asm \ mulredc15.asm mulredc16.asm mulredc17.asm mulredc18.asm \ mulredc19.asm mulredc20.asm EXTRA_DIST = Makefile.dev README mulredc_1_2.m4 mulredc.m4 generate_all \ powerpc-defs.m4 noinst_LTLIBRARIES = libmulredc.la noinst_HEADERS = mulredc.h # This library definition also causes the mulredc[n].asm and redc.asm files # to go in the distribution - no need for having them in EXTRA_DIST libmulredc_la_SOURCES = $(MULREDC) redc.asm # It's actually the .s files that depend on config.m4, but automake # knows them only as intermediate files, not as targets. 
Adding the # dependency to libmulredc.la should work so long as no stale .s # files exist. libmulredc_la_DEPENDENCIES = $(top_builddir)/config.m4 # The asm code does not depend on any libraries except libc for abort() # if assertions are enabled LIBS = LDFLAGS = # Rules for generating the .asm files from the .m4 scripts mulredc1.asm: mulredc_1_2.m4 $(M4) -DLENGTH=1 $< > $@ mulredc2.asm: mulredc_1_2.m4 $(M4) -DLENGTH=2 $< > $@ mulredc3.asm: mulredc.m4 $(M4) -DLENGTH=3 $< > $@ mulredc4.asm: mulredc.m4 $(M4) -DLENGTH=4 $< > $@ mulredc5.asm: mulredc.m4 $(M4) -DLENGTH=5 $< > $@ mulredc6.asm: mulredc.m4 $(M4) -DLENGTH=6 $< > $@ mulredc7.asm: mulredc.m4 $(M4) -DLENGTH=7 $< > $@ mulredc8.asm: mulredc.m4 $(M4) -DLENGTH=8 $< > $@ mulredc9.asm: mulredc.m4 $(M4) -DLENGTH=9 $< > $@ mulredc10.asm: mulredc.m4 $(M4) -DLENGTH=10 $< > $@ mulredc11.asm: mulredc.m4 $(M4) -DLENGTH=11 $< > $@ mulredc12.asm: mulredc.m4 $(M4) -DLENGTH=12 $< > $@ mulredc13.asm: mulredc.m4 $(M4) -DLENGTH=13 $< > $@ mulredc14.asm: mulredc.m4 $(M4) -DLENGTH=14 $< > $@ mulredc15.asm: mulredc.m4 $(M4) -DLENGTH=15 $< > $@ mulredc16.asm: mulredc.m4 $(M4) -DLENGTH=16 $< > $@ mulredc17.asm: mulredc.m4 $(M4) -DLENGTH=17 $< > $@ mulredc18.asm: mulredc.m4 $(M4) -DLENGTH=18 $< > $@ mulredc19.asm: mulredc.m4 $(M4) -DLENGTH=19 $< > $@ mulredc20.asm: mulredc.m4 $(M4) -DLENGTH=20 $< > $@ .asm.s: $(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.s #.asm.S: # $(M4) -I../ -DOPERATION_$* `test -f $< || echo '$(srcdir)/'`$< >$*.S ecm-6.4.4/powerpc64/mulredc4.asm0000644023561000001540000002557612113421641013277 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. 
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc4(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc4 GLOBL .GSYM_PREFIX`'mulredc4 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc4: .quad .GSYM_PREFIX`'mulredc4, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc4, 24 C Implements multiplication and REDC for two input numbers of 4 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0. C Assume this is true for index i-1. Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 4] array, having 4+1 8-byte words C The tmp array needs 4+1 entries, but tmp[4] is stored in C r15, so only 4 entries are used in the stack.
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'4,`@function') .GSYM_PREFIX`'mulredc4: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 32 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 16(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 24(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 3 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 24(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc4, .-.GSYM_PREFIX`'mulredc4 ecm-6.4.4/powerpc64/mulredc8.asm0000644023561000001540000004107212113421641013270 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. 
If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc8 GLOBL .GSYM_PREFIX`'mulredc8 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc8: .quad .GSYM_PREFIX`'mulredc8, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc8, 24 C Implements multiplication and REDC for two input numbers of 8 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. 
C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 8] array, having 8+1 8-byte words C The tmp array needs 8+1 entries, but tmp[8] is stored in C r15, so only 8 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'8,`@function') .GSYM_PREFIX`'mulredc8: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 64 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 48(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 56(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 7 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 56(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc8, .-.GSYM_PREFIX`'mulredc8 ecm-6.4.4/powerpc64/README0000755023561000001540000000176012106741272011735 00000000000000The files in this directory (powerpc64) were contributed by Philip McLaughlin . They are distributed under the LGPL license, whose text is in ../COPYING.LIB. mulredc[1..20].asm are size-specific asm functions for mulredc. Sizes 1 and 2 may be regenerated by the m4 script mulredc_1_2.m4. 
Sizes 3 through 20 may be regenerated by the m4 script mulredc.m4. This generation is not done automatically with the autoconf/automake stuff. If you need to regenerate them, the syntax is (for 1 and 2) m4 -DLENGTH=1 mulredc_1_2.m4 > mulredc1.asm m4 -DLENGTH=2 mulredc_1_2.m4 > mulredc2.asm (for 3 through 20) m4 -DLENGTH=3 mulredc.m4 > mulredc3.asm m4 -DLENGTH=4 mulredc.m4 > mulredc4.asm ... etc., up to LENGTH=20. If you have problems, you should reconfigure with the --disable-asm-redc option. redc.asm is a version of redc separated from the multiplication, since there are cases where it is needed. test_mulredc.c, bench.c and the Makefile.dev are for development. ecm-6.4.4/powerpc64/mulredc11.asm0000644023561000001540000005153212113421641013344 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. 
dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc11(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc11 GLOBL .GSYM_PREFIX`'mulredc11 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc11: .quad .GSYM_PREFIX`'mulredc11, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc11, 24 C Implements multiplication and REDC for two input numbers of 11 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. 
For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 11] array, having 11+1 8-byte words C The tmp array needs 11+1 entries, but tmp[11] is stored in C r15, so only 11 entries are used in the stack. TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'11,`@function') .GSYM_PREFIX`'mulredc11: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. 
We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 88 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 7 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 8 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 9 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 10. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 72(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 80(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 10 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6 ld r14, 56(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high 
half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 56(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 7 ld r14, 64(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 56(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 64(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 48(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 8 ld r14, 72(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 64(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 72(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 56(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 9 ld r14, 80(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 72(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C 
U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 80(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 64(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 10. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 80(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 72(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 80(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc11, .-.GSYM_PREFIX`'mulredc11 ecm-6.4.4/powerpc64/mulredc7.asm0000644023561000001540000003621312113421641013270 00000000000000dnl ****************************************************************************** dnl Copyright 2009 Paul Zimmermann and Alexander Kruppa. dnl dnl This file is part of the ECM Library. 
dnl dnl The ECM Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published by dnl the Free Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl The ECM Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the ECM Library; see the file COPYING.LIB. If not, write to dnl the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, dnl MA 02110-1301, USA. dnl ****************************************************************************** define(C, ` dnl') C mp_limb_t mulredc7(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y, C const mp_limb_t *m, mp_limb_t inv_m); C C arguments: C r3 = ptr to result z least significant limb C r4 = ptr to input x least significant limb C r5 = ptr to input y least significant limb C r6 = ptr to modulus m least significant limb C r7 = -1/m mod 2^64 C C final carry returned in r3 include(`config.m4') GLOBL GSYM_PREFIX`'mulredc7 GLOBL .GSYM_PREFIX`'mulredc7 .section ".opd", "aw" .align 3 GSYM_PREFIX`'mulredc7: .quad .GSYM_PREFIX`'mulredc7, .TOC.@tocbase, 0 .size GSYM_PREFIX`'mulredc7, 24 C Implements multiplication and REDC for two input numbers of 7 words C The algorithm: C (Notation: a:b:c == a * 2^128 + b * 2^64 + c) C C T1:T0 = x[i]*y[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ; C /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 (see note 2) */ C for (i = 1; i < len; i++) C { C cy:T1:T0 = 
x[i]*y[0] + tmp[1]:tmp[0] ; C u = (T0*invm) % 2^64 ; C cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */ C for (j = 1; j < len; j++) C { C cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ; C /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3 C for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */ C tmp[j-1] = T0; C } C tmp[len-1] = T1 ; C tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */ C } C z[0 ... len-1] = tmp[0 ... len-1] ; C return (tmp[len]) ; C C notes: C C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2, C so cy:T1 <= 2*2^64 - 4. C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4 C <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2), C so cy:T1 <= 2*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1), C so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j. C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4) C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2), C so cy:T1 <= 3*2^64 - 3. For j > 1, C x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1), C so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1. C For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0. C Assume this is true for index i-1, Then C x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1 C <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1), C so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction. C C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11 C YP = r5, MP = r6, TP = r1 (stack ptr) C C local variables: tmp[0 ... 7] array, having 7+1 8-byte words C The tmp array needs 7+1 entries, but tmp[7] is stored in C r15, so only 7 entries are used in the stack. 
TEXT .align 5 C powerPC 32 byte alignment TYPE(.GSYM_PREFIX`'mulredc`'7,`@function') .GSYM_PREFIX`'mulredc7: C ######################################################################## C # i = 0 pass C ######################################################################### C Pass for j = 0. We need to fetch x[i] from memory and compute the new u ld r12, 0(r4) C XI = x[0] ld r0, 0(r5) C y[0] stdu r13, -8(r1) C save r13 mulld r8, r0, r12 C x[0]*y[0] low half stdu r14, -8(r1) C save r14 mulhdu r9, r0, r12 C x[0]*y[0] high half ld r0, 0(r6) C m[0] mulld r11, r7, r8 C U = T0*invm mod 2^64 stdu r15, -8(r1) C save r15 mulld r13, r0, r11 C T0 = U*m[0] low stdu r16, -8(r1) C save r16 li r16, 0 C set r16 to zero for carry propagation subi r1, r1, 56 C set tmp stack space mulhdu r14, r0, r11 C T1 = U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C adde r13, r9, r14 C T0 = initial tmp(0) addze r10, r16 C carry to CY C CY:T1:T0 <= 2*(2^64-1)^2 <= 2^2*128 - 4*2^64 + 2, hence C CY:T1 <= 2*2^64 - 4 C Pass for j = 1 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 2 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 3 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 4 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 5 mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! 
mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C add high word with carry to T1 addze r10, r16 C carry to CY std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <= C 2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3 C Pass for j = 6. Don't fetch new data from y[j+1]. mulld r8, r0, r12 C x[i]*y[j] low half mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 adde r14, r9, r10 C add high word with carry + CY to T1 C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry! mulld r8, r0, r11 C U*m[j] low mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C add high word with carry to T1 std r8, 40(r1) C store tmp[len-2] addze r15, r16 C put carry in r15 (tmp[len] <= 1) std r13, 48(r1) C store tmp[len-1] C ######################################################################### C # i > 0 passes C ######################################################################### li r9, 6 C outer loop count mtctr r9 1: C Pass for j = 0. 
We need to fetch x[i], tmp[i] and tmp[i+1] from memory C and compute the new u ldu r12, 8(r4) C x[i] ld r0, 0(r5) C y[0] ld r13, 0(r1) C tmp[0] mulld r8, r0, r12 C x[i]*y[0] low half ld r14, 8(r1) C tmp[1] mulhdu r9, r0, r12 C x[i]*y[0] high half addc r13, r8, r13 C T0 ld r0, 0(r6) C m[0] mulld r11, r7, r13 C U = T0*invm mod 2^64 adde r14, r9, r14 C T1 mulld r8, r0, r11 C U*m[0] low addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[0] high ld r0, 8(r5) C y[1] addc r8, r8, r13 C result = 0 adde r13, r9, r14 C T0, carry pending C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1, C so cy:T1 <= 3*2^64 - 4 C Pass for j = 1 ld r14, 16(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 8(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 16(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 0(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 2 ld r14, 24(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 16(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 24(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 8(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 3 ld r14, 32(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] 
+ CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 24(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 32(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 16(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 4 ld r14, 40(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 32(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 40(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 24(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 5 ld r14, 48(r1) C tmp[j+1] mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r14, r10 C tmp[j+1] + CY + pending carry addze r10, r16 C carry to CY mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 40(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r10 C add carry to CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word ld r0, 48(r5) C y[j+1] adde r13, r9, r14 C T1, carry pending std r8, 32(r1) C store tmp[j-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64 C <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3 C Pass for j = 6. Don't fetch new data from y[j+1]. 
mulld r8, r0, r12 C x[i]*y[j] low half adde r14, r15, r10 C T1 = tmp[len] + CY + pending carry C since tmp[len] <= 1, T1 <= 3 and carry is zero mulhdu r9, r0, r12 C x[i]*y[j] high half ld r0, 48(r6) C m[j] addc r13, r8, r13 C add low word to T0 mulld r8, r0, r11 C U*m[j] low adde r14, r9, r14 C add high to T1 addze r10, r16 C CY mulhdu r9, r0, r11 C U*m[j] high addc r8, r8, r13 C add T0 and low word adde r13, r9, r14 C T1, carry pending std r8, 40(r1) C store tmp[len-2] addze r15, r10 C store tmp[len] <= 1 std r13, 48(r1) C store tmp[len-1] C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64 C <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1) bdnz 1b C Copy result from tmp memory to z ld r8, 0(r1) ldu r9, 8(r1) std r8, 0(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) ldu r9, 8(r1) stdu r8, 8(r3) stdu r9, 8(r3) ldu r8, 8(r1) stdu r8, 8(r3) mr r3, r15 C return tmp(len) ldu r16, 8(r1) ldu r15, 8(r1) ldu r14, 8(r1) ldu r13, 8(r1) addi r1, r1, 8 blr .size .GSYM_PREFIX`'mulredc7, .-.GSYM_PREFIX`'mulredc7 ecm-6.4.4/auxlib.c0000644023561000001540000001465412106741273010660 00000000000000/* Auxiliary routines for the ecm library. Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2011 Paul Zimmermann, Alexander Kruppa. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. 
If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ /* need stdio.h and stdarg.h for gmp.h to declare gmp_vfprintf */ #include #include #include #include "ecm-impl.h" #if TIME_WITH_SYS_TIME # include # include #else # if HAVE_SYS_TIME_H # include # else # include # endif #endif #ifdef HAVE_LIMITS_H # include #else # ifndef ULONG_MAX # define LONG_MAX (__GMP_ULONG_MAX / 2) # endif #endif #ifdef HAVE_STDINT #include #else /* size_t is an unsigned integer so this ought to work */ #ifndef SIZE_MAX #define SIZE_MAX (~((size_t) 0)) #endif #endif #define VERBOSE __ECM(verbose) static int VERBOSE = OUTPUT_NORMAL; void mpz_add_si (mpz_t r, mpz_t s, long i) { if (i >= 0) mpz_add_ui (r, s, (unsigned long) i); else mpz_sub_ui (r, s, (unsigned long) (-i)); } void mpz_sub_si (mpz_t r, mpz_t s, long i) { if (i >= 0) mpz_sub_ui (r, s, (unsigned long) i); else mpz_add_ui (r, s, (unsigned long) (-i)); } /* Divide RS by 3 */ void mpz_divby3_1op (mpz_t RS) { mp_size_t abssize = mpz_size (RS); if (abssize == 0) return; mpn_divexact_by3 (RS->_mp_d, RS->_mp_d, abssize); if (RS->_mp_d[abssize - 1] == 0) RS->_mp_size -= mpz_sgn (RS); } /* Convert a double d to a size_t. If d < 0., returns 0. If d > MAX_SIZE, returns MAX_SIZE. */ size_t double_to_size (double d) { if (d < 0.) 
return (size_t) 0; if (d > (double) SIZE_MAX) return SIZE_MAX; return (size_t) d; } /* cputime () gives the elapsed time in milliseconds */ #if defined (_WIN32) /* First case - GetProcessTimes () is the only known way of getting process * time (as opposed to calendar time) under mingw32 */ #include long cputime () { FILETIME lpCreationTime, lpExitTime, lpKernelTime, lpUserTime; ULARGE_INTEGER n; HANDLE hProcess = GetCurrentProcess(); GetProcessTimes (hProcess, &lpCreationTime, &lpExitTime, &lpKernelTime, &lpUserTime); /* copy FILETIME to a ULARGE_INTEGER as recommended by MSDN docs */ n.u.LowPart = lpUserTime.dwLowDateTime; n.u.HighPart = lpUserTime.dwHighDateTime; /* lpUserTime is in units of 100 ns. Return time in milliseconds */ return (long) (n.QuadPart / 10000); } #elif defined (HAVE_GETRUSAGE) /* Next case: getrusage () has higher resolution than clock () and so is preferred. */ #ifdef HAVE_SYS_TYPES_H # include #endif #ifdef HAVE_SYS_RESOURCE_H # include #endif long cputime () { struct rusage rus; getrusage (RUSAGE_SELF, &rus); /* This overflows a 32 bit signed int after 2147483s = 24.85 days */ return rus.ru_utime.tv_sec * 1000L + rus.ru_utime.tv_usec / 1000L; } #else /* Resort to clock (), which on some systems may return calendar time. */ long cputime () { /* Return time in milliseconds */ return (long) (clock () * (1000. / (double) CLOCKS_PER_SEC)); } #endif /* defining cputime () */ /* ellapsed time (in milliseconds) between st0 and st1 (values of cputime) */ long elltime (long st0, long st1) { if (st1 >= st0) return st1 - st0; else { /* A wrap around can only really happen on a system where long int is 32 bit and where we use clock(). So we assume that there was exactly one wrap-around which "swallowed" LONG_MAX * (1000. / (double) CLOCKS_PER_SEC) milliseconds. */ return st1 - st0 + (long)(LONG_MAX * (1000. 
/ (double) CLOCKS_PER_SEC)); } } /* Get real (wall-clock) time in milliseconds */ long realtime () { #ifdef HAVE_GETTIMEOFDAY struct timeval tv; if (gettimeofday(&tv, NULL) != 0) return 0L; return (long) tv.tv_sec * 1000L + (long) tv.tv_usec / 1000L; #else return 0L; #endif } int get_verbose () { return VERBOSE; } /* Tests if loglevel gets printed with the current verbose setting */ int test_verbose (int loglevel) { return (loglevel <= VERBOSE); } void set_verbose (int v) { VERBOSE = v; } int inc_verbose () { VERBOSE ++; return VERBOSE; } int outputf (int loglevel, char *format, ...) { va_list ap; int n = 0; va_start (ap, format); MEMORY_TAG; /* For gmp_*printf's temp allocs */ if (loglevel != OUTPUT_ERROR && loglevel <= VERBOSE) { n = gmp_vfprintf (ECM_STDOUT, format, ap); fflush (ECM_STDOUT); } else if (loglevel == OUTPUT_ERROR) n = gmp_vfprintf (ECM_STDERR, format, ap); MEMORY_UNTAG; va_end (ap); return n; } void writechkfile (char *chkfilename, int method, double p, mpmod_t modulus, mpres_t A, mpres_t x, mpres_t z) { FILE *chkfile; char *methodname; mpz_t t; outputf (OUTPUT_DEVVERBOSE, "Writing checkpoint to %s at p = %.0f\n", chkfilename, p); switch (method) { case ECM_ECM : methodname = "ECM"; break; case ECM_PM1 : methodname = "P-1"; break; case ECM_PP1 : methodname = "P+1"; break; default: outputf (OUTPUT_ERROR, "writechkfile: Invalid method\n"); return; } chkfile = fopen (chkfilename, "w"); if (chkfile == NULL) { outputf (OUTPUT_ERROR, "Error opening checkpoint file %s\n", chkfilename); return; } mpz_init (t); gmp_fprintf (chkfile, "METHOD=%s; B1=%.0f; N=%Zd;", methodname, p, modulus->orig_modulus); mpres_get_z (t, x, modulus); gmp_fprintf (chkfile, " X=0x%Zx;", t); if (method == ECM_ECM) { mpres_get_z (t, z, modulus); gmp_fprintf (chkfile, " Z=0x%Zx;", t); mpres_get_z (t, A, modulus); gmp_fprintf (chkfile, " A=0x%Zx;", t); } fprintf (chkfile, "\n"); mpz_clear (t); fflush (chkfile); fclose (chkfile); } 
ecm-6.4.4/random.c0000644023561000001540000001131412106741273010642 00000000000000/* Random initialization for P-1 and P+1. Copyright 2005, 2006, 2008 Paul Zimmermann, Alexander Kruppa, Dave Newman. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #ifdef OUTSIDE_LIBECM # include "ecm-ecm.h" #else # include "ecm-impl.h" #endif #ifdef HAVE_UNISTD_H # include /* getpid */ #endif #ifdef TIME_WITH_SYS_TIME # include # include #else # if HAVE_SYS_TIME_H # include # else # include # endif #endif #if defined (_MSC_VER) || defined (__MINGW32__) # include # include #endif #if 0 /* dirty hack until outputf gets fixed */ #ifdef outputf # undef outputf # define outputf(x,y) printf(y) #endif #endif /* put in 'a' a valid random seed for P-1, i.e. gcd(a,n)=1 and a <> {-1,1} */ void pm1_random_seed (mpz_t a, mpz_t n, gmp_randstate_t randstate) { mpz_t q; mpz_init (q); do { mpz_urandomb (a, randstate, 32); mpz_gcd (q, a, n); } while (mpz_cmp_ui (q, 1) != 0 || mpz_cmp_ui (a, 1) == 0 || mpz_cmp_si (a, -1) == 0); mpz_clear (q); } /* put in seed a valid random seed for P+1 */ void pp1_random_seed (mpz_t seed, mpz_t n, gmp_randstate_t randstate) { mpz_t q; /* need gcd(p^2-4, n) = 1. 
*/ mpz_init (q); do { mpz_urandomb (q, randstate, 32); mpz_add_ui (q, q, 1); mpz_set (seed, q); mpz_mul (q, q, q); mpz_sub_ui (q, q, 4); mpz_gcd (q, q, n); } while (mpz_cmp_ui (q, 1) != 0); mpz_clear (q); } /* Produces a random unsigned long value */ #if defined (_MSC_VER) || defined (__MINGW32__) unsigned long get_random_ul (void) { SYSTEMTIME tv; HCRYPTPROV Prov; if (CryptAcquireContext (&Prov, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) { int r; unsigned long rnd; r = CryptGenRandom (Prov, sizeof (unsigned long), (void *) &rnd); CryptReleaseContext (Prov, 0); if (r) { /* warning: outputf is not exported from libecm */ #if !defined (OUTSIDE_LIBECM) && !defined(GPUECM) outputf (OUTPUT_DEVVERBOSE, "Got seed for RNG from CryptGenRandom\n"); #endif return rnd; } } /* warning: outputf is not exported from libecm */ #if !defined (OUTSIDE_LIBECM) && !defined(GPUECM) outputf (OUTPUT_DEVVERBOSE, "Got seed for RNG from GetSystemTime\n"); #endif GetSystemTime (&tv); /* This gets us 27 bits of somewhat "random" data based on the time clock. 
It would probably do the program justice if a better random mixing was done in the non-MinGW get_random_ul if /dev/random does not exist */ return ((tv.wHour<<22)+(tv.wMinute<<16)+(tv.wSecond<<10)+tv.wMilliseconds) ^ ((tv.wMilliseconds<<17)+(tv.wMinute<<11)+(tv.wHour<<6)+tv.wSecond); } #else unsigned long get_random_ul (void) { FILE *rndfd; struct timeval tv; unsigned long t; /* Try /dev/urandom */ rndfd = fopen ("/dev/urandom", "r"); if (rndfd != NULL) { if (fread (&t, sizeof (unsigned long), 1, rndfd) == 1) { /* warning: outputf is not exported from libecm */ #if !defined (OUTSIDE_LIBECM) && !defined(GPUECM) outputf (OUTPUT_DEVVERBOSE, "Got seed for RNG from /dev/urandom\n"); #endif fclose (rndfd); return t; } fclose (rndfd); } #ifdef HAVE_GETTIMEOFDAY if (gettimeofday (&tv, NULL) == 0) { /* warning: outputf is not exported from libecm */ #if !defined (OUTSIDE_LIBECM) && !defined(GPUECM) outputf (OUTPUT_DEVVERBOSE, "Got seed for RNG from gettimeofday()\n"); #endif return (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec * 2147483629UL; } #endif /* warning: outputf is not exported from libecm */ #if !defined (OUTSIDE_LIBECM) && !defined(GPUECM) outputf (OUTPUT_DEVVERBOSE, "Got seed for RNG from time()+getpid()\n"); #endif /* Multiply one value by a large prime to get a bit of avalance effect */ return (unsigned long) time (NULL) + (unsigned long) getpid () * 2147483629UL; } #endif ecm-6.4.4/TODO0000644023561000001540000001444412106741273007715 00000000000000ToDo's (see also TODO.sp): Table of contents: 1) efficiency/memory 2) interface 3) documentation 4) installation 5) bugs 6) others 1) efficiency/memory - use a random sigma value of 64 bits by default - try the mpn/generic/{sb,dc,mu}_bdiv_qr.c functions in GMP >= 4.3.0 for REDC - the conversion from NTT primes to mpz_t in function mpzspv_to_mpzv() (file mpzspv.c) is quadratic. A faster conversion is possible with a product tree (already done for the mpz_t -> NTT conversion). 
- even worse, mpzspm_init seems to be cubic in the input size (because the CRT algorithm used is quadratic in sp_num). We should use a subquadratic CRT. - the "Reducing G * H" step is faster in NTT than with KS. This is probably due to the fact that some transforms are cached in the NTT mode. - the "Reducing G * H" step can be improved as follows: first compute D = GH*I mod (x^d+1) where d = deg(F), and I = 1/F mod (x^d+1); then compute E = D*F mod (x^d-1); finally compute T = (GH-E)/2 mod (x^d+1). T equals the Montgomery product GH/(x^d+1) mod F. See the paper "Fast convolution meets Montgomery" by Preda Mihailescu (Mathematics of Computation). - slowdown in stage 1 with REDC between a 58672-digit number and a 58688-digit number [reported by Christophe.CLAVIER@gemalto.com, 29 Aug 2007] (((2003663613*2^195000-2)/(2*23*173*3863))/1954173900202379)/3612632846010637 ((2003663613*2^195000-2)/(2*23*173*3863))/1954173900202379 with B1=1000 on an Opteron (44.2s for c58672, 67.5s for c58688). The culprit seems to be the REDC routine in mpmod.c: indeed, in case the modulus has n limbs, but the most significant one has only a few bits, the product (called x in REDC) has only 2n-1 limbs, and we never call Mulders's short product in ecm_redc_n (however the else-code using full products seem faster in that case). For c58672, if one replaces if (xn == 2 * n) in mpmod.c/REDC by if (xn >= 2 * n - 1), the time of stage 1 grows from 44s to 64s, whereas ecm_redc_n should be faster... This problem is still present in 6.2, ecm_redc_n should be better tuned, in particular the choice k=0.75*n in ecm_mul_lo_n() is far from optimal. - in Brent-Suyama's extension, the evaluation of a polynomial of degree k over N consecutive values is currently done using a O(k N) algorithm [table of differences]. 
One can do O(N/k M(k)), cf Section 3 from "Linear recurrences with polynomial coefficients and application to integer factorization and Cartier-Manin operator", by Alin Bostan, Pierrick Gaudry and Eric Schost, SIAM journal on computing, vol. 36, no. 6, pp. 1777 - 1806, 2007. It is not clear if this result also applies to ECM, but at least it should work for P-1 and P+1. - why restrict the use of mpn_mul_fft to Fermat numbers? We could use it for any cofactor of 2^(n*BITS_PER_MP_LIMB)+1, as long as mpn_fft_next_size (n, mpn_fft_best_k (n, S1 == S2)) == n. - use mpres in step 2 (Target: 7.0) - write a mpn version of add3 and duplicate - rewrite entire mpmod.c to be based on mpn_* functions, not mpz_* - take relative speed of multiplying/squaring into account in PRAC (DN: couldn't get any significant speed increase) - use/implement a mpn_mul_hi_n routine for use in mpn_REDC - use mpn_addmul_2, mpn_addmul_4 in the basecase REDC [for machines where it exists]. ASM code should perhaps be moved into GMP. - try McLaughlin's algorithm for Montgomery's modular multiplication (http://www.ams.org/mcom/0000-000-00/S0025-5718-03-01543-6/home.html) - consider Colin Percival's generalized DWT for multiplication modulo k*a^n+b, where k*a*b is highly composite. May belong to GMP rather than GMP-ECM. - implement assembly code (redc.asm) for other architectures - allow composite d2, or better use the S1+S2 idea from the P+-1 algorithm of Montgomery and Kruppa. - init mpz_t's with correct amount of memory allocated to avoid reallocs. Check for reallocs with GMP's memory interface routines. (Partly done.) - try sliding window multiplication for ECM stage 1 (Target: 7.0) - choose Brent/Suyama polynomial according to B2/k and not B2! - Adjust estimated memory to take into account -treefile and NTT (done but improvement possible) - when GWNUM is used, lower the default B2 (James Wanless, 17 Mar 2006, james at grok.ltd.uk) - implement enhanced standard continuation? 
With graph cover algorithm? - parallel/distributed stage 2? - add curve selection for torsion group of order 8 or 16, see Montgomery's thesis (request of Peter Lawrence Montgomery) - Torbjörn Granlund suggested faster code for mpn_mod_1(), used extensively in NTT. See http://lists.gforge.inria.fr/pipermail/ecm-discuss/2008-May/003365.html 2) interface - from Mark Rodenkirch 08 April 2011: print messages like "Step 1: 1500000/100000000" with a command-line option (or with -v) http://lists.gforge.inria.fr/pipermail/ecm-discuss/2011-April/004088.html - with -resume, print %time for THIS RUN instead of total run? [suggested by SleepHound ] Add CPUTIME=... in the save file, to take into account the total cpu time spent so far (in seconds). George Woltman agrees with that change. It won't hurt prime95/mprime -> will be added for his next version. - when resuming, print the *initial* x0 for P-1/P+1? - [from Jakub Pawlewicz ] add an option -stage1time t to tell the step 1 time, when done by another program. PZ: or better have it in resume file? (Target: 6.1. Command line option done) 3) documentation 4) installation - check for __builtin_constant_p and __builtin_expect at configure time - [suggested by Peter Montgomery] add the possibility to compile a "fat" binary, which automatically selects the best mulredc assembly code depending on the cpuid [see TODO.fat] - [suggested by Thomas Kunz, who did port GMP-ECM to the PS3, i.e., to the Cell architecture]: several changes to make it easier to port GMP-ECM to specific architectures. Cf TODO.kunz. 5) bugs 6) others - add primality proving of factors/cofactors? Maybe link Pari for this? - add point counting algorithm? SEA implementation exists for Pari/GP, use that? - let user specify previous factoring work, compute distribution of candidate factors, compute probability of/est. time to finding a factor with given parameters. - re-write in C++? Lots of work, but would make parts of the code much cleaner. 
ecm-6.4.4/README0000644023561000001540000010357612106741273010112 00000000000000This is the README file for GMP-ECM. (See INSTALL-ecm for installing GMP-ECM and the ecm library, and README.lib for using the ecm library.) Table of contents of this file: 1. Basic usage. 2. How to use P-1, P+1, and ECM efficiently? 3. Extra factors and Brent-Suyama's extension. 4. Memory usage. 5. Expression syntax reference for GMP-ECM's syntax parser. 6. Options -save, -resume and -chkpnt. 7. Executing shell commands (the -faccmd, -prpcmd and -idlecmd options). 8. How to get the best of GMP-ECM? 9. Record factors. 10. Known problems. ############################################################################## 1. Basic usage GMP-ECM reads the numbers to be factored from stdin (one number on each line) and requires a numerical parameter, the stage 1 bound B1. A reasonable stage 2 bound B2 for the given B1 is chosen by default, but can be overridden by a second numerical parameter. By default, GMP-ECM uses the ECM factoring algorithm. Example: To run one curve of ECM with B1=1000000 on each number in the file "composites", run ecm 1000000 < composites To use a B2 value of ~5*10^8 instead of the default value of ~10^9, run ecm 1000000 5e8 < composites Scientific notation is accepted for B1 and B2 values. The actual B2 value used may be larger than the specified value to let parameters satisfy some conditions imposed by the stage 2 algorithm. To run one curve with B1=11e7 on M1061, simply do: echo "2^1061-1" | ecm 11e7 To run more than one ECM curve on each input number, use the -c parameter. Example: to run 100 curves with B1=1000000 and default B2 on each number in "composites", run ecm -c 100 1000000 < composites To use the P-1 or P+1 factoring methods, use the -pm1 or -pp1 parameter, respectively. 
Example: to use the P-1 method with B1=10^9 on all numbers in the file "composites", run ecm -pm1 1e9 < composites Note that, unlike for ECM, using the same B1,B2 bounds on one number is quite useless for P-1, and of limited use for P+1. See "2. How to use P-1, P+1, and ECM efficiently?" ############################################################################## 2. How to use P-1, P+1, and ECM efficiently? The P-1 method works well when the input number has a prime factor P such that P-1 is "smooth", i.e., has all its prime factor less or equal the step 1 bound B1, except one which may be less or equal the second step bound B2. For P=67872792749091946529, we have P-1 = 2^5 * 11 * 17 * 19 * 43 * 149 * 8467 * 11004397, so this factor will be found as long as B1 >= 8467 and B2 >= 11004397: $ echo 67872792749091946529 | ./ecm -pm1 -x0 2809890345 8467 11004397 GMP-ECM ... [powered by GMP ...] [P-1] Input number is 67872792749091946529 (20 digits) Using B1=8467, B2=6710-19370830, x0=2809890345 Step 1 took 3ms Step 2 took 14ms ********** Factor found in step 2: 67872792749091946529 Found input number N There is no need to run P-1 several times with the same B1 and B2 as there is for ECM, since a factor found with one seed will (almost always) be found by another one. The P+1 method works well when the input number has a prime factor P such that P+1 is "smooth". For P=4190453151940208656715582382315221647, we have P+1 = 2^4 * 283 * 2423 * 21881 * 39839 * 1414261 * 2337233 * 132554351, so this factor will be found as long as B1 >= 2337233 and B2 >= 132554351: $ echo 4190453151940208656715582382315221647 | ./ecm -pp1 -x0 7 2337233 132554351 GMP-ECM ... [powered by GMP ...] 
[P+1] Input number is 4190453151940208656715582382315221647 (37 digits) Using B1=2337233, B2=2324738-343958122, x0=7 Step 1 took 750ms Step 2 took 120ms ********** Factor found in step 2: 4190453151940208656715582382315221647 Found input number N However not all seeds will succeed: only half of the seeds 'x0' work for P+1 (namely those where the Jacobi symbol of x0^2-4 and P is -1.) Unfortunately, since P is usually not known in advance, there is no way to ensure that this holds. However, if the seed is chosen randomly, there is a probability of about 1/2 that it will give a Jacobi symbol of -1 (i.e., the factor P will be found if P+1 is smooth enough). A rule of thumb is to run 3 times P+1 with different random seeds. The seeds 2/7 and 6/5 have a slightly higher chance of success than average as they lead to a group order divisible by 6 or 4, respectively. When factoring Fibonacci numbers F_n or Lucas numbers L_n, using the seed 23/11 ensures that the group order is divisible by 2n, making other P+1 (and probably P-1) work unnecessary. As of version 6.2, a new stage 2 for the P-1 and P+1 algorithms is implemented. It uses less memory and is faster than the previous code, thus allowing larger B2 values. If GMP-ECM is configured with the "--enable-openmp" flag and is compiled with a compiler that implements OpenMP, it uses multi-threading for computation of polynomial roots and NTT multiplication. When not using the NTT, it benefits from multi-threading only in the computation of roots phase. The number of threads to use can be controlled with the OMP_NUM_THREADS environment variable. Unlike the previous generic stage 2, the new stage 2 cannot use the Brent-Suyama extension (-power and -dickson parameters). Specifying these options on the command line forces use of the generic stage 2. Note: the notation of the parameters follows that in the paper, the number of multi-point evaluations (similar to "blocks") is given by s_2. 
You can specify a lower limit for s_2 by the -k command line parameter. The ECM method is a probabilistic method, and can be viewed in some sense as a generalization of the P-1 and P+1 method, where we only require that P+t+1 is smooth, where t depends on the curve we use and satisfies |t| <= 2*P^(1/2) (Hasse's theorem). The optimal B1 and B2 bounds have to be chosen according to the (usually unknown) size of P. The following table gives a set of nearly optimal B1 and B2 pairs, with the corresponding expected number of curves to find a factor of given size (column "-power 1" does not take into account the extra factors found by Brent-Suyama's exten- sion, whereas column "default poly" takes them into account, with the poly- nomial used by default: D(n) means Dickson's polynomial of degree n): digits D optimal B1 default B2 expected curves N(B1,B2,D) -power 1 default poly 20 11e3 1.9e6 74 74 [x^1] 25 5e4 1.3e7 221 214 [x^2] 30 25e4 1.3e8 453 430 [D(3)] 35 1e6 1.0e9 984 904 [D(6)] 40 3e6 5.7e9 2541 2350 [D(6)] 45 11e6 3.5e10 4949 4480 [D(12)] 50 43e6 2.4e11 8266 7553 [D(12)] 55 11e7 7.8e11 20158 17769 [D(30)] 60 26e7 3.2e12 47173 42017 [D(30)] 65 85e7 1.6e13 77666 69408 [D(30)] Table 1: optimal B1 and expected number of curves to find a factor of D digits with GMP-ECM. After performing the expected number of curves from Table 1, the probability that a factor of D digits was missed is exp(-1), i.e., about 37%. After twice the expected number of curves, it is exp(-2), i.e., about 14%, and so on. Example: after performing 8266 curves with B1=43e6 and B2=2.4e11 (or 7553 curves with -dickson 12), the probability to miss a 50-digit factor is about 37%. From version 6.0 on, GMP-ECM prints the expected number of curves and expected time to find factors of different sizes in verbose mode (option -v). This makes it easy to further optimize parameters for a certain factor size if desired: simply try to minimize the expected time. 
(lengthy NOTE: The order of an elliptic curve with Montgomery parameteriza- tion, as used by GMP-ECM, is known to be divisible by 12. Therefore one can assume that the probability that the order is B1,B2 smooth should be about as great as for a random integer 1/12th in value. However, Montgomery observed that the order behaves even nicer than that: heuristically, it seems that the order is as likely to be smooth as a random integer about 1/23.4 in value. This is the value we use in GMP-ECM and the computed probabilities match those observed in experiments very well. This however means that the so computed values for the expected number of curves for given B1,B2 values and factor sizes do not match those published in the literature where a factor of only 1/12 was used. The factor GMP-ECM uses is defined as ECM_EXTRA_SMOOTHNESS in rho.c, you can change it to 12.0 if you want to reproduce the more pessimistic values found in the literature.) In summary, we advise the following method: 0 - choose a target factor size of D digits 1 - choose optimal B1 and B2 values to find factors of D digits (cf Table 1) 2 - run once P-1 with 10*B1, and the default B2 chosen by GMP-ECM 3 - (optional) run 3 times P+1 with 5*B1, and the default B2 4 - run N(B1,B2,D) times ECM with those B1 and B2, where N(B1,B2,D) is the expected number of ECM curves with step 1 bound B1, step 2 bound B2, to find a factor of D digits (cf above table). 5 - if no factor is found, either increase D by 5 digits and go to 0, or use another factorization method (MPQS, NFS) Note: if a factor is found in steps 2, 3 or 4, simply continue the current step with the remaining cofactor (if composite). There is no need to start again from 0, since the cofactor was already tested, too. ############################################################################## 3. Extra factors and Brent-Suyama's extension. 
GMP-ECM may sometimes find some "extra" factors, such that one factor of P-1, P+1 or P+t+1 exceeds the step 2 bound B2, thanks to Brent-Suyama's extension. Let's explain how it works for P-1, since it's simpler. The classical step 2 (without Brent-Suyama's extension) considers s^(j*d) mod N and s^i mod N, where N is the number to factor, and s is the residue computed in stage 1. Here, d is fixed, and the integers i and j vary in two sets so that j*d-i covers all primes in [B1, B2]. Now consider a polynomial f(x), and compute s^f(j*d) and s^f(i) instead of s^(j*d) and s^i [thus the classical step 2 corresponds to f(x)=x^1]. Then P will be found whenever all but one of the factors of P-1 are <= B1, and one factor divides some f(j*d) - f(i): $ echo 1207946164033269799036708918081 | ./ecm -pm1 -k 3 -power 12 286493 25e6 GMP-ECM ... [powered by GMP ...] [P-1] Input number is 1207946164033269799036708918081 (31 digits) Using B1=286493, B2=30806172, polynomial x^12, x0=1548711558 Step 1 took 320ms Step 2 took 564ms ********** Factor found in step 2: 1207946164033269799036708918081 Found input number N Here the largest factor of P-1 is 83957197, which is 3.35 times larger than B2. Warning: these "extra" factorizations may not be reproducible from one version of GMP-ECM to another one, since they depend on some internal parameters that may change. For P-1 with the generic stage 2, the degree of the Brent-Suyama polynomial should be even. Since i^2k - (j*d)^2k = (i^k - (j*d)^k)(i^k + (j*d)^k), this allows testing two values symmetric around a multiple of d simultaneously, halving the amount of computation required in stage 2. P+1 with the generic stage 2 and ECM do this inherently. The new fast stage 2 for P-1 and P+1 does not support the Brent-Suyama extension. By default, the fast stage 2 is used for P-1 and P+1; giving a -power or -dickson parameter on the command line forces use of the previous, generic stage 2. 
It is recommended to use the new stage 2 (from version 6.2) for P-1 and P+1, which is the default: it is so much faster that it largely compensates the few extra factors that are not found because Brent-Suyama's extension is not available. The default polynomial used for ECM with a given B2 should be near optimal, i.e., give only a marginal overhead in step 2, while enabling extra factors. ############################################################################## 4. Memory usage. Step 1 does not require much memory: O(n) for an input number of n digits. Step 2 may be quite memory expensive, especially for large B2, since its efficient algorithms use some large tables. To reduce the memory usage of step 2, you may increase the 'k' parameter, which controls the number of "blocks" performed in step 2. Multiplying the default value of k by 4 will decrease the memory usage by a factor of 2. For example with B2=1e10 and a 155-digit number, step 2 requires about 55MB with the default k=4, but only 27MB with k=16. Increasing k does, however, slightly increase the time required for step 2 (see section "How to get the best of GMP-ECM?"). An estimation of the memory usage is given at the start of stage 2: $ ecm -v -k 4 10 1e10 < c155 ... Estimated memory usage: 55M ... Step 2 took 18649ms $ ecm -v -k 16 10 1e10 < c155 ... Estimated memory usage: 27M ... Step 2 took 26972ms Another way is to use the -treefile parameter, which causes some of the tables to be stored on disk instead of in memory. Using the option "-treefile /var/tmp/ecmtree" will create the files "/var/tmp/ecmtree.1", "/var/tmp/ecmtree.2" etc. The files are deleted upon completion of stage 2: $ ecm -v -treefile /tmp/ecmtree -k 4 10 1e10 < c155 ... Estimated memory usage: 36M ... Step 2 took 18648ms Due to time consuming disk I/O, this will cause stage 2 to take somewhat longer. How much memory is saved depends on stage 2 parameters, but a typical value is that memory use is reduced by a factor of about 1.5. 
Increasing the number of blocks with -k also reduces the amount of data that needs to get written to disk, thus reducing disk I/O time. Combining these parameters is a very effective way of reducing memory use. Up from version 6.1, there is still another (better) possibility, with the -maxmem option. The command-line -maxmem nnn option tells GMP-ECM to use at most nnn MB in stage 2. It is better than -k because it takes into account the size of the number to be factored, and automatically adjusts the number of blocks to use: $ ./ecm -v -maxmem 40 10 1e10 < c155 ... dF=8192, k=15, d=79170, d2=11, i0=-10 ... Estimated memory usage: 27M ... Step 2 took 25456ms NOTE that in -b "breadth-first" mode, GMP-ECM reads all candidate numbers in the input stream and keeps them in memory, so if there are many large numbers to be tested, the memory requirement will increase noticeably. ############################################################################## 5. Expression syntax reference for GMP-ECM's syntax parser. GMP-ECM can handle several kinds of expressions as input numbers. Here is the syntax that is handled: 1. Raw decimal numbers like 123456789 2. Comments can be placed in the file. The C++ "one line comment" // is used. Everything after the // on a line (including the //) is ignored. Warning: no input number should appear on such a comment line. 3. Line continuation. If a line ends with a backslash character '\', it is considered it continues on the next line (ignoring the '\'). 4. Any white space (space, tab, end of line) is ignored. However, the "end of line" is used to end the expression (unless of course there is a '\' character before the end of line). For example, processing this: 1 2 3 4 5 6 7 8 9 would be the same as processing 123456789 5. "common" arithmetic expressions (* / + - %), the period '.' might be used in place of * for multiply, and - can be unary minus (e.g., -55555551). Example: echo "3*5+2" | ./ecm 100 6. 
Grouping ( [ { for start of group (which symbol is used does not matter) and ) ] } to end a group (again all 3 symbols mean the SAME thing). 7. Exponentiation with the ^ character (i.e., 2^24 is the same as 16777216). Example: echo "2^24+1" | ./ecm 100 8. Simple factorial using the exclamation point ! character. Example is 53! == 1*2*3*4...*52*53. Example: echo '53!+1' | ./ecm 1e2 9. Multi-factorial as in: n!m with an example: 15!3 == 15.12.9.6.3. 10. Simple Primorial using the # character with example of 11# == 2*3*5*7*11 11. Reduced Primorial n#m with example of 17#5 == 5.7.11.13.17 12. Functions are possible with the expression parser. Currently, the only available function is Phi(x,n), however other functions should be easy to add in the future. Note: Expressions are maintained as much as possible (even if the expression becomes longer than the decimal expansion). Expressions are output as cofactors (if the input was an expression), and are stored into save/resume files (again if and only if the original input was an expression, and not an expanded decimal number). When a factor is found, the cofactor expression is of the form (original_expression)/factor_found (see however option -cofdec): $ echo "3*2^210+1" | ./ecm -sigma 4007218240 2500 GMP-ECM ... [powered by GMP ...] [ECM] Input number is 3*2^210+1 (64 digits) Using B1=2500, B2=186156, polynomial x^1, sigma=4007218240 Step 1 took 16ms Step 2 took 16ms ********** Factor found in step 2: 1358437 Found probable prime factor of 7 digits: 1358437 Probable prime cofactor (3*2^210+1)/1358437 has 58 digits ############################################################################## 6. Options -save, -resume and -chkpnt. These -save and -resume options are useful to save the current state of the computation after step 1, or to exchange data with other software. It allows to perform step 1 with GMP-ECM, and step 2 with another software (or vice-versa). 
Note: the residue from the end of stage 1 gets written to the file only after stage 2, if stage 2 is performed in the same program run. This way, if a factor is found, the save file entry will contain the new cofactor (if composite) or will be omitted (if cofactor is a probable prime). For periodic saving during stage 1 for crash recovery, use -chkpnt, described below. Here is an example how to reuse some P-1 computation: $ cat c71 13155161912808540373988986448257115022677318870175067553764004308210487 $ ./ecm -save toto -pm1 -mpzmod -x0 2 5000000 < c71 GMP-ECM ... [powered by GMP ...] [P-1] Input number is 13155161912808540373988986448257115022677318870175067553764004308210487 (71 digits) Using B1=5000000, B2=352526802, polynomial x^24, x0=2 Step 1 took 3116ms Step 2 took 2316ms The file "toto" now contains some information about the method used, the step 1 bound, the number to factor, the value X at the end of step 1 (in hexa- decimal), and a checksum to verify that no data was corrupted: $ cat toto METHOD=P-1; B1=5000000; N=13155161912808540373988986448257115022677318870175067553764004308210487; X=0x12530157ae22ae14d54d6a5bc404ae9458e54032c1bb2ab269837d1519f; CHECKSUM=2287710189; PROGRAM=GMP-ECM 6.2; X0=0x2; WHO=zimmerma@clafoutis.loria.fr; TIME=Sat Apr 12 13:41:01 2008; Then one can resume the computation with larger B1 and/or B2 as follows: $ ./ecm -resume toto 1e7 GMP-ECM ... [powered by GMP ...] 
[ECM] Resuming P-1 residue saved by zimmerma@clafoutis.loria.fr with GMP-ECM 6.2 on Sat Apr 12 13:41:01 2008 Input number is 13155161912808540373988986448257115022677318870175067553764004308210487 (71 digits) Using B1=5000000-10000000, B2=9946848-1326917772, Step 1 took 3076ms Step 2 took 4304ms ********** Factor found in step 2: 1448595612076564044790098185437 Found probable prime factor of 31 digits: 1448595612076564044790098185437 Probable prime cofactor 9081321110693270343633073697474256143651 has 40 digits The second run only considered the primes in [5e6-10e6] in step 1, which saved half the time of step 1. The format used is the following: - each line corresponds to a composite (expressions ARE saved in the save file) - a line contains assignments <id>=<value> separated by semi-colons ';' - possible values for <id> are - METHOD (value = ECM or P-1 or P+1) - SIGMA (value = ECM sigma parameter) [ECM only] - B1 (first step bound) - N (composite number to factor) - X (value at the end of step 1) - A (A-parameter of the elliptic curve) [ECM only] - CHECKSUM (internal value to check correctness of the format) - PROGRAM (program used to perform step 1, useful for factor credits) - X0 (initial point for ECM, or initial residue for P-1/P+1) [optional] - WHO (who performed step 1) - TIME (date and time of first step) SIGMA and X0 would be optional, and would mainly be used in case a factor is found, to be able to reproduce the factorization. For ECM, one of the SIGMA or A values must be present, so that the computation can be continued on the correct curve. The B1 and X values satisfy the condition that X is a lcm(1,2,...,B1)-th power in the (multiplicatively written) group. If consecutive lines in a save file being resumed contain the same number to be factored, say when many ECM curves on one number have been saved, factors discovered by GMP-ECM are carried over from one attempt to the next so that factors will be reported only once. 
If the cofactor is a probable prime, or if the -one option was given and a factor was found, the remaining consecutive lines for that number will be skipped. Note: it is allowed to have both -save f1 and -resume f2 for the same run, however the files f1 and f2 should be different. Remark: you should not perform in parallel several -resume runs on the same input with the same B1/B2 values, since those runs will do the same computations. Options -save/-resume are useful in the following cases: (a) somebody did a previous step 1 computation with another software which is faster than GMP-ECM, and wants to perform Step 2 with GMP-ECM which is faster for that task. (b) somebody did a previous step 1 for P-1 or P+1 up to a given bound B1, and you want to extend that computation with B1' > B1, without restarting from scratch. Note: this does not apply to ECM, where the smoothness property depends on the (random) curve chosen, not only on the input number. (c) you did a huge step 1 P-1 or P+1 computation on a given machine, and you want to perform a huge step 2 in parallel on several machines. For example machine 1 tests the range B2_1-B2_2, machine 2 tests B2_2-B2_3, ... This also decreases the memory usage for each machine, which is function of the range width B2max-B2min. For the same reason as (b), this does not apply to ECM. The -chkpnt option causes GMP-ECM to write the current residue periodically during the stage 1 computation. This is useful as a safeguard in case the GMP-ECM process is terminated, or the computer loses power, etc. The checkpoint is written every ten minutes, when a signal (SIGINT, SIGTERM) is received, and at the end of stage 1. The format of the checkpoint file is very similar to that of regular save files, and checkpoints can be resumed with the -resume option. 
For example: $ ecm -chkpnt pm1chkpoint -pm1 1e10 1 < largenumber.txt [Computer crashes during computation] $ ecm -resume pm1chkpoint 1e10 1 Note: if an existing file is specified as the checkpoint file, it will be silently overwritten! Note 2: When resuming a checkpoint file, additional small primes may be processed in stage 1 when the checkpoint file is resumed, so the end-of-stage 1 residues of an uninterrupted run and a checkpointed run may not match. The extra primes do not reduce the probability of finding factors, however. ############################################################################## 7. Executing shell commands You can tell GMP-ECM to execute shell commands when a factor is found or to run an external program for PRP testing. This feature is not compiled in by default, it must be enabled by the parameter --enable-shellcmd when running "configure". If you specify -faccmd <cmd> on the commandline, <cmd> will be executed whenever a factor is found by P-1, P+1 or ECM (not by trial division). The original number, the factor found and the cofactor are passed to <cmd> via stdin, each number on a line. You may use this for example to have factors automatically sent to you by email: ecm -faccmd 'mail -s "$HOSTNAME found a factor" myemail@myplace.org' \ -c 900 1e6 < candidates.txt The parameter -prpcmd <cmd> lets you specify a program to perform a probable primality test instead of the GMP built-in function. The number to test is passed on one line to <cmd> via stdin. The result of the test is expected as the exit code of <cmd>, where exit code 0 (true) means "is probably prime" and a non-zero code (false) means "is composite". Example: ecm -prpcmd "pfgw --" -c 900 1e6 . The parameter -idlecmd <cmd> will make GMP-ECM run <cmd> before each ECM, P-1 or P+1 attempt on a number. If the exit status of <cmd> is non-zero, GMP-ECM terminates immediately, otherwise it continues normally. 
GMP-ECM resumes only after <cmd> has terminated, so this is a way for letting GMP-ECM sleep while the system is busy - just let <cmd> sleep until the system is idle again. ############################################################################## 8. How to get the best of GMP-ECM? [this part has to be rewritten, beginning of new part] After configuring GMP-ECM, type "make bench_mulredc" (or "make bench_mulredc.exe" under Windows) and type: $ ./bench_mulredc # or bench_mulredc.exe under Windows This will output at the end two lines as follows: /* 0:mulredc 1:mul+redc_1 2:mul+redc_2 3:mul+redc_n */ #define TUNE_MULREDC_TABLE {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* 0:mulredc 1:sqr+redc_1 2:sqr+redc_2 3:sqr+redc_n */ #define TUNE_SQRREDC_TABLE {0,0,0,0,0,0,0,0,1,1,3,1,1,3,1,1,1,1,2,1,2} Then put those lines in the ecm-params.h file (which is a symbolic link), or replace the corresponding values in that file if already present, and recompile GMP-ECM. [end of new part] Choice of modular multiplication. The ecm program may choose between 4 kinds of modular arithmetic: (1) Montgomery's REDC algorithm at the word level (option -modmuln). It is quite fast for small numbers, but has quadratic asymptotic complexity. (2) classical GMP arithmetic (option -mpzmod). Has some overhead with respect to (1) for small sizes, but wins over (1) for larger sizes since it has quasi-linear asymptotic complexity. (3) Montgomery's REDC algorithm at high level (option -redc). This essentially replaces each division by two multiplications. Slower than (1) and (2) for small inputs, but better for large or very large inputs. (4) base-2 arithmetic for numbers dividing 2^n+1 or 2^n-1. Each division has only linear time, but the multiplications are more expensive since they are done on larger numbers. (5) If you have a 64 bit processor, use it. Both GMP and the NTT code in GMP-ECM perform MUCH better with 64 bit arithmetic than with 32 bits. This of course requires that you use a 64 bit OS. 
Many Linux and BSD distributions let you choose between 32 bit and 64 bit at installation. The ecm program automatically selects what it thinks is the best arithmetic for the given input number. If that choice is not optimal, you may force the use of a certain arithmetic by trying options -modmuln, -mpzmod, -redc. (The best choice should depend on B1 and B2 only very little, so long as B1 is not too small, say >= 1000.) Number of step 2 blocks. The step 2 range [B1,B2] is divided into k "big blocks". The default value of k is chosen to be near to optimal. However, it may be that for a given (B1,B2) pair, another value of k may be better. Try for this to give the option -k <n> to ecm, where <n> is 1, 2, 3, ... This will force ecm to divide step 2 in at least <n> blocks. Changing the value of the number of blocks will not modify the chance of finding a factor (except for extra factors, but some will be lost, and some will be won, so the balance should be nearly even). However it will change the time spent in Step 2 and modify the memory used by Step 2 (see the section "Memory usage"). Optimal thresholds. The thresholds for the algorithms used in ecm are defined in ecm-params.h. Several ecm-params.h.* files are included in the distribution and the configure script will select one matching your machine if it exists. If there is no ecm-params.h.* for your machine then you can either compile with default values (not recommended) or you can generate ecm-params.h first with "make ecm-params; make". Stage 2 now uses Number-Theoretic Transforms (NTT) for polynomial arithmetic by default for numbers of at most 30 machine words (NTT_SIZE_THRESHOLD in ecm-ecm.h). The NTT code forces dF to be a power of 2; it can be disabled by passing the command-line option -no-ntt and unconditionally enabled by -ntt. Performance of NTT is dependent on: - Architecture. NTT seems to give the greatest improvement on Athlons, and the least improvement on Pentiums without SSE2. - Thresholds. 
It is vital to have ecm-params.h properly tuned for your machine. - C compiler. The SSE2 assembly code for 32 bit and the assembly code for 64 bit only work for x86 using gcc or Intel cc, so it is compiler dependent. Note on factoring Fermat numbers: GMP-ECM features Schönhage-Strassen multiplication for polynomials in stage 2 when factoring Fermat numbers (not in the new, fast stage 2 for P+1 and P-1. This is to be implemented.) This greatly reduces the number of modular multiplications required, thus improving speed. It does, however, restrict the length of the polynomials to powers of two, so that for a given number of blocks (-k parameter), the B2 value can only increase by factors of approximately 4. For the number of blocks, choices of 2, 3 or 4 usually give best performance. However, if the polynomial degree becomes too large, relatively expensive Karatsuba or Toom-Cook methods are required to split the polynomial before Schönhage-Strassen's method can handle them. That can make a larger number of blocks worthwhile. When factoring the m-th Fermat number F_m = 2^(2^m)+1, degrees up to dF=2^(m+1) can be handled directly. If your B2 choice requires a degree much larger than this (dF is printed with the -v parameter), try increasing the number of blocks with -k and see if performance improves. The Brent-Suyama extension should not be used when factoring Fermat numbers, it is more efficient to simply increase B2. Therefore, -power 1 for P+1 and ECM, and -power 2 for P-1 are the default for Fermat numbers. (Larger degrees for Brent-Suyama may possibly become worthwhile for P-1 runs on smaller Fermat numbers and extremely large B2, when Karatsuba and Toom-Cook are used extensively.) Factoring Fermat numbers uses a lot of memory, depending on the size of the Fermat number and on dF. For dF=65536 and F_12, the memory used is about 1700MB. 
If your system does not have enough memory, you will have to use a larger number of blocks to reach the desired B2 value with a smaller poly degree dF, which sacrifices some performance. Additionally, you may use the -treefile option (see 4. Memory usage) k=1 k=2 k=3 k=4 dF=256 582132 1222002 1864182 2504052 dF=512 2443992 5008092 7572192 10131672 dF=1024 10016172 20263332 30519732 40766892 dF=2048 42634420 85689250 128744080 171798910 dF=4096 173259252 347500242 521780502 696021492 dF=8192 711738310 1425139180 2138540050 2851940920 dF=16384 2850278350 5703881830 8557643650 11411247130 dF=32768 11702792020 23412731170 35122670320 46832609470 dF=65536 48071333326 96165459406 144259585486 192353711566 dF=131072 194020810630 388069884940 582118959250 776168033560 Table 2: Stage 2 interval length B2-B2min, for dF a power of 2 and small values of k. For example, if you'd like to run stage 2 on F_12 with B2 ~= 40G, try parameters "-k 1 48e9", "-k 3 35e9" or "-k 4 46e9". ############################################################################## 9. Record factors. If you find a very large factor, the program will print a message like: Report your potential champion to (see ) This means that your factor might be a champion, i.e., one of the top-ten largest factors ever found by the corresponding method (P-1, P+1 or ECM). Cf the following URLs: ECM: http://wwwmaths.anu.edu.au/~brent/ftp/champs.txt P-1: http://www.loria.fr/~zimmerma/records/Pminus1.html P+1: http://www.loria.fr/~zimmerma/records/Pplus1.html ############################################################################## 10. Known problems. On some machines, GMP-ECM uses the clock() function to measure the cpu time used by step 1 and 2. Since that function returns a 32-bit integer, there is a possible wrap-around effect when the clock() value goes back from 2^32-1 to 0, which may produce negative timings. 
The NTT code uses primes that fit in one machine word and that are congruent to 1 (mod l), where l is the largest transform length required for the desired stage 2 parameters. For very large B2 on 32-bit machines, there may not be enough suitable primes, which may limit the possible transform length to less than what available memory would permit. This problem occurs mostly in the fast stage 2 for P-1 and P+1, as the generic stage 2 uses far more memory for a given polynomial degree, so that memory on a 32-bit machine will be exhausted long before suitable NTT primes are. The maximal transform length depends on the size of the input number. For a transform length l on a 32 bit machine, N must satisfy: l=2^11:N<2^756200, l=2^12:N<2^379353, l=2^13:N<2^190044, l=2^14:N<2^94870, l=2^15:N<2^47414, l=2^16:N<2^23322, l=2^17:N<2^11620, l=2^18:N<2^5891, l=2^19:N<2^2910, l=2^20:N<2^1340, l=2^21:N<2^578, l=2^22:N<2^228. Since log(N)*l is approximately constant, this limits the amount of memory that can be used to about 600MB for P-1, and 1200MB for P+1. ecm-6.4.4/factor.c0000644023561000001540000001003412106741273010636 00000000000000/* factor.c - public interface for libecm. Copyright 2005, 2006, 2007, 2009, 2011 Paul Zimmermann, Alexander Kruppa, David Cleaver, Cyril Bouvier. This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. 
If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include "ecm-impl.h" void ecm_init (ecm_params q) { q->method = ECM_ECM; /* default method */ MEMORY_TAG; mpz_init_set_ui (q->x, 0); mpz_init_set_ui (q->sigma, 0); q->sigma_is_A = 0; mpz_init_set_ui (q->go, 1); q->B1done = ECM_DEFAULT_B1_DONE + 1. / 1048576.; mpz_init_set_si (q->B2min, -1.0); /* default: B2min will be set to B1 */ mpz_init_set_si (q->B2, ECM_DEFAULT_B2); q->k = ECM_DEFAULT_K; q->S = ECM_DEFAULT_S; /* automatic choice of polynomial */ q->repr = ECM_MOD_DEFAULT; /* automatic choice of representation */ q->nobase2step2 = 0; /* continue special base 2 code in ecm step 2, if used */ q->verbose = 0; /* no output (default in library mode) */ q->os = stdout; /* standard output */ q->es = stderr; /* error output */ q->chkfilename = NULL; q->TreeFilename = NULL; q->maxmem = 0.0; q->stage1time = 0.0; MEMORY_TAG; gmp_randinit_default (q->rng); MEMORY_TAG; gmp_randseed_ui (q->rng, get_random_ul ()); MEMORY_UNTAG; q->use_ntt = 1; q->stop_asap = NULL; q->batch = 0; /* no batch mode by default in library mode */ q->batch_B1 = 1.0; mpz_init_set_ui(q->batch_s, 1); q->gw_k = 0.0; q->gw_b = 0; q->gw_n = 0; q->gw_c = 0; } void ecm_clear (ecm_params q) { mpz_clear (q->x); mpz_clear (q->sigma); mpz_clear (q->go); mpz_clear (q->B2min); mpz_clear (q->B2); gmp_randclear (q->rng); mpz_clear (q->batch_s); } /* returns ECM_FACTOR_FOUND, ECM_NO_FACTOR_FOUND, or ECM_ERROR */ int ecm_factor (mpz_t f, mpz_t n, double B1, ecm_params p) { int res; /* return value */ int p_is_null; ecm_params q; double B1done, B2scale; if ((p_is_null = (p == NULL))) { p = q; ecm_init (q); } /* Ugly hack to pass B2scale to the library somehow. It gets piggy-backed onto B1done. The next major release will have to allow for variable length parameter structs. 
*/ B1done = floor (p->B1done); B2scale = (p->B1done - B1done) * 1048576.; p->B1done = B1done; if (p->method == ECM_ECM) res = ecm (f, p->x, p->sigma, n, p->go, &(p->B1done), B1, p->B2min, p->B2, B2scale, p->k, p->S, p->verbose, p->repr, p->nobase2step2, p->use_ntt, p->sigma_is_A, p->os, p->es, p->chkfilename, p->TreeFilename, p->maxmem, p->stage1time, p->rng, p->stop_asap, p->batch, p->batch_s, p->gw_k, p->gw_b, p->gw_n, p->gw_c); else if (p->method == ECM_PM1) res = pm1 (f, p->x, n, p->go, &(p->B1done), B1, p->B2min, p->B2, B2scale, p->k, p->S, p->verbose, p->repr, p->use_ntt, p->os, p->es, p->chkfilename, p->TreeFilename, p->maxmem, p->rng, p->stop_asap); else if (p->method == ECM_PP1) res = pp1 (f, p->x, n, p->go, &(p->B1done), B1, p->B2min, p->B2, B2scale, p->k, p->S, p->verbose, p->repr, p->use_ntt, p->os, p->es, p->chkfilename, p->TreeFilename, p->maxmem, p->rng, p->stop_asap); else { fprintf (p->es, "Error, unknown method: %d\n", p->method); res = ECM_ERROR; } if (p_is_null) ecm_clear (q); return res; } ecm-6.4.4/test.pm10000755023561000001540000000736312106741274010631 00000000000000#!/bin/sh # test file for P-1 method # # Copyright 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 # Paul Zimmermann, Alexander Kruppa, Dave Newman, Jim Fougeron. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. 
# Command under test: the ecm binary given as first argument, in P-1 mode.
# $PM1 is expanded unquoted on purpose below so that it splits into the
# program name and its -pm1 option.
PM1="$1 -pm1"

# Call with "checkcode $? n" to check that return code is n
# the return code is (see ecm-ecm.h):
# 0: no factor found
# 1: error (for example out of memory)
# 2: composite factor found with composite cofactor
# 6: prime factor found with composite cofactor
# 8: input number found
# 10: composite factor found with prime cofactor
# 14: prime factor found with prime cofactor
checkcode () {
  # Quote both operands: an empty or missing argument must produce the
  # diagnostic below, not a syntax error from test(1).
  if [ "$1" != "$2" ]
  then
    echo "############### ERROR ###############"
    echo "Expected return code $2 but got $1"
    exit 1
  fi
}

### bug in ecm-5.0 found by Jay Berg (overflow in i0*d)
echo 441995541378330835457 | $PM1 -x0 3 157080 7e9-72e8; checkcode $? 8

### stage 2 less than 10^9. Input is prime factor of 2^731-1 ###
echo 335203548019575991076297 | $PM1 -x0 2 23 31; checkcode $? 8

### stage 2 of length 1 ###
echo 335203548019575991076297 | $PM1 -x0 3 31 58766400424189339249-58766400424189339249; checkcode $? 8

# try primes < d in stage 2
echo 2050449353925555290706354283 | $PM1 -k 1 20 0-1e6; checkcode $? 14

# This factor was missed by an early development version of stage 2
echo 67872792749091946529 | $PM1 -x0 3 8467 11004397; checkcode $? 8

echo 5735039483399104015346944564789 | $PM1 1277209 9247741; checkcode $? 8

echo 620224739362954187513 | $PM1 -x0 3 668093 65087177; checkcode $? 8

echo 1405929742229533753 | $PM1 1123483 75240667; checkcode $? 8

echo 16811052664235873 | $PM1 -x0 3 19110 178253039; checkcode $? 8

echo 9110965748024759967611 | $PM1 1193119 316014211; checkcode $? 8

echo 563796628294674772855559264041716715663 | $PM1 4031563 14334623; checkcode $? 8

echo 188879386195169498836498369376071664143 | $PM1 3026227 99836987; checkcode $? 8

# factor of 909*9^909+1 found by Paul Leyland on 15 Nov 2002
echo 474476178924594486566271953891 | $PM1 9594209 519569569; checkcode $? 8

### stage 2 less than 10^10 ###
echo 2124306045220073929294177 | $PM1 290021 1193749003; checkcode $? 8

### Try saving and resuming
echo 25591172394760497166702530699464321 | $PM1 -save test.pm1.save 100000
checkcode $? 0
$PM1 -resume test.pm1.save 120557 2007301
# Remember the exit status before rm overwrites $?, and remove the save
# file before checking so it is cleaned up even when the check fails.
C=$?
/bin/rm -f test.pm1.save
checkcode $C 8

# bug in ecm-5.0 (overflow in fin_diff_coeff)
echo 504403158265489337 | $PM1 -k 4 8 9007199254740700-9007199254740900; checkcode $? 8

# check that primes near B2min are covered
echo 6857 | $PM1 840 857; checkcode $? 8

# A test with a larger input number to test modular arithmetic routines not
# in mulredc*.asm. This input has 1363 bits so it has 22 64-bit words
# (43 32-bit words) and cannot use mulredc which handles only up to 20 limbs
echo "10090030271*10^400+696212088699" | $PM1 2e3 2e6; checkcode $? 14

# check bug fixed in revision 1378
echo "2^(64*2)-1" | $PM1 -redc -x0 -1 2 1; checkcode $? 8

# check bug fixed in revision 2068
echo "234^997+997^234" | $PM1 -ntt 100 324; checkcode $? 0

echo "All P-1 tests are ok."
If not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "config.h" #include #include /* for abort */ #include #include "ecm.h" void *__gmp_default_allocate (size_t); void *__gmp_default_reallocate (void *, size_t, size_t); void __gmp_default_free (void *, size_t); /* Each block allocated is a separate malloc, for the benefit of a redzoning malloc debugger during development or when bug hunting. Sizes passed when reallocating or freeing are checked (the default routines don't care about these). Memory leaks are checked by requiring that all blocks have been freed when tests_memory_end() is called. Test programs must be sure to have "clear"s for all temporary variables used. */ #define NAME_LEN 8 struct header { void *ptr; size_t size; char name[NAME_LEN]; unsigned int line; struct header *next; }; struct header *tests_memory_list = NULL; static unsigned long nr_realloc = 0, nr_realloc_move = 0; static char cur_name[NAME_LEN]; static unsigned int cur_line; static unsigned long cur_mem, peak_mem; /* Return a pointer to a pointer to the found block (so it can be updated when unlinking). 
*/ static struct header ** tests_memory_find (void *ptr) { struct header **hp; for (hp = &tests_memory_list; *hp != NULL; hp = &((*hp)->next)) if ((*hp)->ptr == ptr) return hp; return NULL; } #if 0 static int tests_memory_valid (void *ptr) { return (tests_memory_find (ptr) != NULL); } #endif static void * tests_allocate (size_t size) { struct header *h; int i; if (size == 0) { printf ("tests_allocate(): attempt to allocate 0 bytes\n"); abort (); } if (cur_name[0] == 0) cur_name[1] = 0; /* Set breakpoint here to find untagged allocs */ h = (struct header *) __gmp_default_allocate (sizeof (*h)); h->next = tests_memory_list; tests_memory_list = h; h->size = size; h->ptr = (struct header*) __gmp_default_allocate (size); for (i = 0; i < NAME_LEN; i++) h->name[i] = cur_name[i]; h->line = cur_line; cur_mem += size; if (cur_mem > peak_mem) peak_mem = cur_mem; return h->ptr; } static void * tests_reallocate (void *ptr, size_t old_size, size_t new_size) { struct header **hp, *h; if (new_size == 0) { printf ("tests_reallocate(): attempt to reallocate 0x%lX to 0 bytes\n", (unsigned long) ptr); abort (); } hp = tests_memory_find (ptr); if (hp == NULL) { printf ("tests_reallocate(): attempt to reallocate bad pointer 0x%lX\n", (unsigned long) ptr); abort (); } h = *hp; if (h->size != old_size) { printf ("tests_reallocate(): bad old size %zd, should be %zd\n", old_size, h->size); abort (); } if (h->size > cur_mem) { printf ("tests_reallocate(): h->size = %zd but cur_mem = %lu\n", h->size, cur_mem); abort(); } cur_mem = cur_mem - h->size + new_size; if (cur_mem > peak_mem) peak_mem = cur_mem; #if 0 printf ("Reallocating %p, first allocated in %s, line %d, from %d to %d\n", ptr, h->name, h->line, h->size, new_size); if (new_size <= h->size) printf ("Unnecessary realloc!\n"); #endif nr_realloc++; h->size = new_size; h->ptr = (struct header*) __gmp_default_reallocate (ptr, old_size, new_size); if (h->ptr != ptr) nr_realloc_move++; return h->ptr; } static struct header ** 
tests_free_find (void *ptr) { struct header **hp = tests_memory_find (ptr); if (hp == NULL) { printf ("tests_free(): attempt to free bad pointer 0x%lX\n", (unsigned long) ptr); abort (); } return hp; } static void tests_free_nosize (void *ptr) { struct header **hp = tests_free_find (ptr); struct header *h = *hp; if (h->size > cur_mem) { printf ("tests_free_nosize(): h->size = %zd but cur_mem = %lu\n", h->size, cur_mem); abort(); } cur_mem -= h->size; *hp = h->next; /* unlink */ __gmp_default_free (ptr, h->size); __gmp_default_free (h, sizeof (*h)); } void tests_free (void *ptr, size_t size) { struct header **hp = tests_free_find (ptr); struct header *h = *hp; if (h->size != size) { printf ("tests_free(): bad size %zd, should be %zd\n", size, h->size); abort (); } tests_free_nosize (ptr); } void tests_memory_start (void) { mp_set_memory_functions (tests_allocate, tests_reallocate, tests_free); cur_name[0] = 0; cur_line = 0; cur_mem = 0L; peak_mem = 0L; } void tests_memory_reset (void) { mp_set_memory_functions (__gmp_default_allocate, __gmp_default_reallocate, __gmp_default_free); } void tests_memory_end (void) { if (tests_memory_list != NULL) { struct header *h; unsigned count; printf ("tests_memory_end(): not all memory freed\n"); count = 0; for (h = tests_memory_list; h != NULL; h = h->next) { count++; printf ("Memory at %p, allocated by %s, line %d\n", h->ptr, h->name, h->line); } printf (" %u block(s) remaining\n", count); abort (); } if (cur_mem != 0) { printf ("tests_memory_end(): cur_mem = %lu but list of allocated " "memory empty\n", cur_mem); abort (); } printf ("%lu reallocates, %lu reallocates with move, peak_mem = %lu\n", nr_realloc, nr_realloc_move, peak_mem); } void tests_memory_status (void) { unsigned count = 0, size = 0; if (tests_memory_list != NULL) { struct header *h; for (h = tests_memory_list; h != NULL; h = h->next) { count++; size += h->size; } } if (size != cur_mem) { printf ("tests_memory_status(): size = %d but cur_mem = %lu", size, 
/* Returns the gcd of a and b, computed with Euclid's algorithm.
   gcd (a, 0) = a, so gcd (0, 0) = 0.  */
unsigned long
gcd (unsigned long a, unsigned long b)
{
  unsigned long t;

  while (b != 0UL)
    {
      t = a % b;
      a = b;
      b = t;
    }

  return a;
}

/* Returns Euler's totient function phi(n), found by trial division of n.
   Returns 0 for n = 0.  */
unsigned long
eulerphi (unsigned long n)
{
  unsigned long phi = 1UL, p;

  if (n == 0UL)
    return 0UL;

  /* Try p = 2, 3, 5, 7, ...  The bound is written p <= n / p because
     p * p can wrap around for n close to ULONG_MAX.  */
  for (p = 2UL; p <= n / p; p += (p == 2UL) ? 1UL : 2UL)
    {
      if (n % p == 0UL)
        {
          /* p^e exactly dividing n contributes (p - 1) * p^(e-1).  */
          phi *= p - 1UL;
          n /= p;
          while (n % p == 0UL)
            {
              phi *= p;
              n /= p;
            }
        }
    }

  /* now n is prime or 1 */
  return (n == 1UL) ? phi : phi * (n - 1UL);
}

/* Returns ceil(log(n)/log(2)), i.e. the smallest k with 2^k >= n.
   n must be positive.  */
unsigned int
ceil_log2 (unsigned long n)
{
  unsigned int k = 0;

  ASSERT (n > 0UL);

  /* The answer is the bit length of n - 1.  */
  n--;
  while (n)
    {
      k++;
      n >>= 1;
    }

  return k;
}

/* Simple, slow methods for testing / finding primes */

/* Returns 1 if n is prime and 0 otherwise, by trial division with odd
   candidates up to sqrt(n).  */
int
is_prime (const unsigned long n)
{
  unsigned long i;

  if (n < 2UL)
    return 0;

  if (n % 2UL == 0UL)
    return n == 2UL;

  /* i <= n / i instead of i * i <= n: the product can wrap around.  */
  for (i = 3UL; i <= n / i; i += 2UL)
    if (n % i == 0UL)
      return 0;

  return 1;
}

/* Returns the smallest prime larger than n.  If no such prime fits in an
   unsigned long the search wraps around.  */
unsigned long
next_prime (const unsigned long n)
{
  unsigned long m;

  if (n < 2UL)
    return 2UL;

  /* First odd number above n.  Starting at n + 2 only works for odd n:
     for even n every candidate would be even and none ever prime.  */
  m = (n + 1UL) | 1UL;
  while (! is_prime (m))
    m += 2UL;

  return m;
}

/* Returns the smallest prime factor of N. If N == 1, return 1.
   N must be non-zero.  */
unsigned long
find_factor (const unsigned long N)
{
  unsigned long i;

  ASSERT_ALWAYS (N != 0UL);

  if (N == 1UL)
    return 1UL;

  if (N % 2UL == 0UL)
    return 2UL;

  /* i <= N / i instead of i * i <= N: the product can wrap around.  */
  for (i = 3UL; i <= N / i; i += 2UL)
    if (N % i == 0UL)
      return i;

  /* No divisor up to sqrt(N): N is prime.  */
  return N;
}