coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

factor improvements by Torbjörn and Niels: final warning


From: Jim Meyering
Subject: factor improvements by Torbjörn and Niels: final warning
Date: Fri, 05 Oct 2012 10:58:31 +0200

The factor series I've posted here so far have not been bisectable,
and that was a problem.  We must really make an effort to ensure
that any multi-commit series is at least compilable after each commit.
Better still, ensure that the code builds and passes "make check"
or even "make syntax-check" or dist-check.

Here I drew the line at "make check", which does pass after each of
these commits.  "make syntax-check" does not pass for the first few,
but does towards the middle, and of course at the end.

To achieve that, I added the new code in factor-ng.c initially, made
improvements to that file in the series, and towards the end, only once
all new rules and tests were in place did I move factor-ng.c to factor.c.
I've also been careful to list both Niels and Torbjörn as Author:
and in "Co-authored-by: ..." (in commit log) for each of the commits
that they contributed.  Torbjörn, Neils, FYI, those log comments are
automatically converted to ChangeLog entries that list both author names.
That's the ChangeLog that appears in each release tarball.

I will push this later today.
Feedback welcome.

 a/src/factor-ng.c       | 2396 ----------------------------------------------
 b/.gitignore            |    6
 b/AUTHORS               |    2
 b/NEWS                  |    7
 b/cfg.mk                |   23
 b/src/.gitignore        |    2
 b/src/factor-ng.c       | 2388 +++++++++++++++++++++++++++++++++++++++++++++
 b/src/factor.c          | 2497 ++++++++++++++++++++++++++++++++++++++++++------
 b/src/local.mk          |   31
 b/src/longlong.h        | 2156 +++++++++++++++++++++++++++++++++++++++++
 b/src/make-prime-list.c |  168 +++
 b/tests/factor/run.sh   |   30
 b/tests/local.mk        |   71 +
 src/factor-ng.c         |  124 +-
 src/factor.c            |    6
 src/longlong.h          |    6
 src/make-prime-list.c   |   30
 src/wheel-gen.pl        |  114 --

Summary:

  [1/8] factor: prepare for the new factor program
  [2/8] maint: make-prime-list: avoid -Wsuggest-attribute=const
  [3/8] maint: make-prime-list: do not ignore write failure
  [4/8] maint: make-prime-list: syntax conventions; be robust
  [5/8] factor: new much-improved implementation; not yet
  [6/8] factor: more improvements
  [7/8] factor: merge with preexisting factor; integrate tests;
  [8/8] factor: 25% speed-up, on output

>From 1fb5db95aaddc8216449052f11c5da05c8571db5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torbj=C3=B6rn=20Granlund?= <address@hidden>
Date: Sun, 16 Sep 2012 21:59:40 +0200
Subject: [PATCH 1/8] factor: prepare for the new factor program
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/make-prime-list.c: New file, from nt-factor.
Co-authored-by: Niels Möller <address@hidden>
---
 src/make-prime-list.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 src/make-prime-list.c

diff --git a/src/make-prime-list.c b/src/make-prime-list.c
new file mode 100644
index 0000000..1f5b3ce
--- /dev/null
+++ b/src/make-prime-list.c
@@ -0,0 +1,168 @@
+/* Factoring of uintmax_t numbers. Generation of needed tables.
+
+   Contributed to the GNU project by Torbjörn Granlund and Niels Möller
+   Contains code from GNU MP.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see http://www.gnu.org/licenses/.  */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+struct prime
+{
+  unsigned p;
+  uintmax_t pinv; /* Inverse mod b = 2^{bitsize of uintmax_t} */
+  uintmax_t lim; /* floor(UINTMAX_MAX / p) */
+};
+
+static uintmax_t
+binvert (uintmax_t a)
+{
+  uintmax_t x = 0xf5397db1 >> (4*((a/2) & 0x7));
+  for (;;)
+    {
+      uintmax_t y = 2*x - x*x*a;
+      if (y == x)
+        return x;
+      x = y;
+    }
+}
+
+static void
+process_prime (struct prime *info, unsigned p)
+{
+  info->p = p;
+  info->pinv = binvert (p);
+  info->lim = UINTMAX_MAX / (uintmax_t) p;
+}
+
+static void
+output_primes (const struct prime *primes, unsigned nprimes)
+{
+  unsigned i;
+  unsigned p;
+  int is_prime;
+  const char *suffix;
+
+  puts ("/* Generated file -- DO NOT EDIT */\n");
+
+  if (sizeof(uintmax_t) <= sizeof(unsigned long))
+    suffix = "UL";
+  else if (sizeof(uintmax_t) <= sizeof(unsigned long long))
+    suffix = "ULL";
+  else
+    {
+      fprintf (stderr, "Don't know how to write uintmax_t constants.\n");
+      exit (EXIT_FAILURE);
+    }
+
+#define SZ (int)(2*sizeof(uintmax_t))
+
+  for (i = 0, p = 2; i < nprimes; i++)
+    {
+      printf ("P(%2u, %3u, 0x%0*jx%s, 0x%0*jx%s) /* %d */\n",
+              primes[i].p - p,
+              (i + 8 < nprimes) ? primes[i + 8].p - primes[i].p : 0xff,
+              SZ, primes[i].pinv, suffix,
+              SZ, primes[i].lim, suffix, primes[i].p);
+      p = primes[i].p;
+    }
+
+  printf ("\n#undef FIRST_OMITTED_PRIME\n");
+
+  /* Find next prime */
+  do
+    {
+      p += 2;
+      for (i = 0, is_prime = 1; is_prime; i++)
+        {
+          if (primes[i].p * primes[i].p > p)
+            break;
+          if ( (uintmax_t) p * primes[i].pinv <= primes[i].lim)
+            {
+              is_prime = 0;
+              break;
+            }
+        }
+    }
+  while (!is_prime);
+
+  printf ("#define FIRST_OMITTED_PRIME %d\n", p);
+}
+
+static void *
+xalloc (size_t s)
+{
+  void *p = malloc(s);
+  if (p)
+    return p;
+
+  fprintf (stderr, "Virtual memory exhausted.\n");
+  exit (EXIT_FAILURE);
+}
+
+int
+main (int argc, char **argv)
+{
+  int limit;
+
+  char *sieve;
+  size_t size, i;
+
+  struct prime *prime_list;
+  unsigned nprimes;
+
+  if (argc != 2)
+    {
+      fprintf (stderr, "Usage: %s LIMIT\n"
+               "Produces a list of odd primes <= LIMIT\n", argv[0]);
+      return EXIT_FAILURE;
+    }
+  limit = atoi(argv[1]);
+  if (limit < 3)
+    exit (EXIT_SUCCESS);
+
+  /* Make limit odd */
+  if ( !(limit & 1))
+    limit--;
+
+  size = (limit-1)/2;
+  /* sieve[i] represents 3+2*i */
+  sieve = xalloc (size);
+  memset (sieve, 1, size);
+
+  prime_list = xalloc (size * sizeof (*prime_list));
+  nprimes = 0;
+
+  for (i = 0; i < size;)
+    {
+      unsigned p = 3+2*i;
+      unsigned j;
+
+      process_prime (&prime_list[nprimes++], p);
+
+      for (j = (p*p - 3)/2; j < size; j+= p)
+        sieve[j] = 0;
+
+      while (i < size && sieve[++i] == 0)
+        ;
+    }
+
+  output_primes (prime_list, nprimes);
+
+  return EXIT_SUCCESS;
+}
-- 
1.8.0.rc0.18.gf84667d


>From cf67e4cc9025ce921230b821cb7e75bce06876cd Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Wed, 19 Sep 2012 11:07:41 +0200
Subject: [PATCH 2/8] maint: make-prime-list: avoid -Wsuggest-attribute=const
 warning

* src/make-prime-list.c: Include <config.h>.
(binvert): Add _GL_ATTRIBUTE_CONST.
---
 src/make-prime-list.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/make-prime-list.c b/src/make-prime-list.c
index 1f5b3ce..724924b 100644
--- a/src/make-prime-list.c
+++ b/src/make-prime-list.c
@@ -17,6 +17,8 @@ PARTICULAR PURPOSE.  See the GNU General Public License for 
more details.
 You should have received a copy of the GNU General Public License along with
 this program.  If not, see http://www.gnu.org/licenses/.  */

+#include <config.h>
+
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
@@ -29,7 +31,7 @@ struct prime
   uintmax_t lim; /* floor(UINTMAX_MAX / p) */
 };

-static uintmax_t
+static uintmax_t _GL_ATTRIBUTE_CONST
 binvert (uintmax_t a)
 {
   uintmax_t x = 0xf5397db1 >> (4*((a/2) & 0x7));
-- 
1.8.0.rc0.18.gf84667d


>From c6369e9004be56d427f0cc0d51d7009d727e061c Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Sun, 23 Sep 2012 11:33:01 +0200
Subject: [PATCH 3/8] maint: make-prime-list: do not ignore write failure

Even though this is just a helper program that is run solely to create
primes.h, it should not ignore a write failure.  Normally we would
simply call atexit (close_stdout), but we cannot do that from this
helper program, since it must be built before the generated header,
primes.h.  If we were to make the linking of make-prime-list depend
on libcoreutils.a, that would add all lib/*.o files to the list
of dependents of $(BUILT_HEADERS).  Then, since there is currently no
provision to ensure that a file like lib/stdio.h (another built header)
is built before the first lib/*.o file that also includes <stdio.h>,
some lib/*.o files would be built before lib/stdio.h and some after.
The former would provoke link failures due to undefined rpl_* functions.
* src/make-prime-list.c: Include <errno.h>.
(fclose): Undef, so that a definition to rpl_fclose does not
cause a link failure.
(main): Per the above, in this exceptional case, we check for fclose
and ferror failure manually, and don't worry about the ferror-only
failure case in which errno may not be relevant.
---
 src/make-prime-list.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/make-prime-list.c b/src/make-prime-list.c
index 724924b..b18b0e5 100644
--- a/src/make-prime-list.c
+++ b/src/make-prime-list.c
@@ -23,6 +23,8 @@ this program.  If not, see http://www.gnu.org/licenses/.  */
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <errno.h>
+#undef fclose

 struct prime
 {
@@ -166,5 +168,11 @@ main (int argc, char **argv)

   output_primes (prime_list, nprimes);

+  if (ferror (stdout) + fclose (stdout))
+    {
+      fprintf (stderr, "write error: %s\n", strerror (errno));
+      return EXIT_FAILURE;
+    }
+
   return EXIT_SUCCESS;
 }
-- 
1.8.0.rc0.18.gf84667d


>From d836ea73400d66ecfa5ecf1287b983976342c4c4 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Thu, 4 Oct 2012 20:27:31 +0200
Subject: [PATCH 4/8] maint: make-prime-list: syntax conventions; be robust
 for large N

* src/make-prime-list.c: Insert spaces before parens.
(main): Abort if the 8-delta value ever exceeds 255.
---
 src/make-prime-list.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/make-prime-list.c b/src/make-prime-list.c
index b18b0e5..49db2ba 100644
--- a/src/make-prime-list.c
+++ b/src/make-prime-list.c
@@ -64,9 +64,9 @@ output_primes (const struct prime *primes, unsigned nprimes)

   puts ("/* Generated file -- DO NOT EDIT */\n");

-  if (sizeof(uintmax_t) <= sizeof(unsigned long))
+  if (sizeof (uintmax_t) <= sizeof (unsigned long))
     suffix = "UL";
-  else if (sizeof(uintmax_t) <= sizeof(unsigned long long))
+  else if (sizeof (uintmax_t) <= sizeof (unsigned long long))
     suffix = "ULL";
   else
     {
@@ -74,13 +74,15 @@ output_primes (const struct prime *primes, unsigned nprimes)
       exit (EXIT_FAILURE);
     }

-#define SZ (int)(2*sizeof(uintmax_t))
+#define SZ (int)(2*sizeof (uintmax_t))

   for (i = 0, p = 2; i < nprimes; i++)
     {
-      printf ("P(%2u, %3u, 0x%0*jx%s, 0x%0*jx%s) /* %d */\n",
-              primes[i].p - p,
-              (i + 8 < nprimes) ? primes[i + 8].p - primes[i].p : 0xff,
+      unsigned int d8 = i + 8 < nprimes ? primes[i + 8].p - primes[i].p : 0xff;
+      if (255 < d8) /* this happens at 668221 */
+        abort ();
+      printf ("P (%2u, %3u, 0x%0*jx%s, 0x%0*jx%s) /* %d */\n",
+              primes[i].p - p, d8,
               SZ, primes[i].pinv, suffix,
               SZ, primes[i].lim, suffix, primes[i].p);
       p = primes[i].p;
@@ -111,7 +113,7 @@ output_primes (const struct prime *primes, unsigned nprimes)
 static void *
 xalloc (size_t s)
 {
-  void *p = malloc(s);
+  void *p = malloc (s);
   if (p)
     return p;

@@ -136,7 +138,7 @@ main (int argc, char **argv)
                "Produces a list of odd primes <= LIMIT\n", argv[0]);
       return EXIT_FAILURE;
     }
-  limit = atoi(argv[1]);
+  limit = atoi (argv[1]);
   if (limit < 3)
     exit (EXIT_SUCCESS);

-- 
1.8.0.rc0.18.gf84667d


>From a38890b27fe296c582d1266cb5b3b02f186132dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torbj=C3=B6rn=20Granlund?= <address@hidden>
Date: Sun, 16 Sep 2012 22:27:20 +0200
Subject: [PATCH 5/8] factor: new much-improved implementation; not yet
 integrated
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/factor-ng.c: New file, from nt-factor.
* src/longlong.h: New file.
* NEWS (Improvements): Mention the upcoming improvements.
Co-authored-by: Niels Möller
---
 NEWS            |    7 +
 src/factor-ng.c | 2388 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/longlong.h  | 2156 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 4551 insertions(+)
 create mode 100644 src/factor-ng.c
 create mode 100644 src/longlong.h

diff --git a/NEWS b/NEWS
index 25f1a27..aff5bf1 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,13 @@ GNU coreutils NEWS                                    -*- 
outline -*-

 ** Improvements

+  factor's core has been rewritten for speed and increased range.
+  It can now factor numbers up to 2^128, even without GMP support.
+  Its speed is from a few times better (for small numbers) to over
+  10,000 times better (just below 2^64).  The new code also runs a
+  deterministic primality test for each prime factor, not just a
+  probabilistic test.
+
   seq is now up to 70 times faster than it was in coreutils-8.19 and prior,
   but only with non-negative whole numbers, an increment of 1, and no
   format-changing options.
diff --git a/src/factor-ng.c b/src/factor-ng.c
new file mode 100644
index 0000000..776aaa6
--- /dev/null
+++ b/src/factor-ng.c
@@ -0,0 +1,2388 @@
+/* Factoring of uintmax_t numbers.
+
+   Contributed to the GNU project by Torbjörn Granlund and Niels Möller
+   Contains code from GNU MP.
+
+Copyright 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2005, 2009, 2012
+Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program.  If not, see http://www.gnu.org/licenses/.  */
+
+/* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
+   or, with GMP, numbers of any size.
+
+  Code organisation:
+
+    There are several variants of many functions, for handling one word, two
+    words, and GMP's mpz_t type.  If the one-word variant is called foo, the
+    two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
+    some cases, the plain function variants will handle both one-word and
+    two-word numbers, evidenced by function arguments.
+
+    The factoring code for two words will fall into the code for one word when
+    progress allows that.
+
+    Using GMP is optional.  Define HAVE_GMP to make this code include GMP
+    factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
+    (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
+    factoring code even if numbers get small enough for using the two-word
+    code.
+
+  Algorithm:
+
+    (1) Perform trial division using a small primes table, but without hardware
+        division since the primes table store inverses modulo the word base.
+        (The GMP variant of this code doesn't make use of the precomputed
+        inverses, but instead relies on GMP for fast divisibility testing.)
+    (2) Check the nature of any non-factored part using Miller-Rabin for
+        detecting composites, and Lucas for detecting primes.
+    (3) Factor any remaining composite part using the Pollard-Brent rho
+        algorithm or the SQUFOF algorithm, checking status of found factors
+        again using Miller-Rabin and Lucas.
+
+    We prefer using Hensel norm in the divisions, not the more familiar
+    Euclidian norm, since the former leads to much faster code.  In the
+    Pollard-Brent rho code and the the prime testing code, we use Montgomery's
+    trick of multiplying all n-residues by the word base, allowing cheap Hensel
+    reductions mod n.
+
+  Improvements:
+
+    * Use modular inverses also for exact division in the Lucas code, and
+      elsewhere.  A problem is to locate the inverses not from an index, but
+      from a prime.  We might instead compute the inverse on-the-fly.
+
+    * Tune trial division table size (not forgetting that this is a standalone
+      program where the table will be read from disk for each invocation).
+
+    * Implement less naive powm, using k-ary exponentiation for k = 3 or
+      perhaps k = 4.
+
+    * Try to speed trial division code for single uintmax_t numbers, i.e., the
+      code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
+      IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
+      using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
+      respectively cycles ought to be possible.
+
+    * The redcify function could be vastly improved by using (plain Euclidian)
+      pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
+      GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
+      similar methoods.  These functions currently dominate run time when using
+      the -w option.
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>             /* for memmove */
+#include <unistd.h>             /* for getopt */
+#include <stdbool.h>
+
+#include "system.h"
+
+#if HAVE_GMP
+# include <gmp.h>
+#endif
+
+#ifndef STAT_SQUFOF
+# define STAT_SQUFOF 0
+#endif
+
+#ifndef USE_LONGLONG_H
+# define USE_LONGLONG_H 1
+#endif
+
+#if USE_LONGLONG_H
+
+/* Make definitions for longlong.h to make it do what it can do for us */
+# define W_TYPE_SIZE 64          /* bitcount for uintmax_t */
+# define UWtype  uintmax_t
+# define UHWtype unsigned long int
+# undef UDWtype
+# if HAVE_ATTRIBUTE_MODE
+typedef unsigned int UQItype    __attribute__ ((mode (QI)));
+typedef          int SItype     __attribute__ ((mode (SI)));
+typedef unsigned int USItype    __attribute__ ((mode (SI)));
+typedef          int DItype     __attribute__ ((mode (DI)));
+typedef unsigned int UDItype    __attribute__ ((mode (DI)));
+# else
+typedef unsigned char UQItype;
+typedef          long SItype;
+typedef unsigned long int USItype;
+#  if HAVE_LONG_LONG
+typedef long long int DItype;
+typedef unsigned long long int UDItype;
+#  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
+typedef long int DItype;
+typedef unsigned long int UDItype;
+#  endif
+# endif
+# define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files 
*/
+# define ASSERT(x)               /* FIXME make longlong.h really standalone */
+# define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
+# define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
+# ifndef __GMP_GNUC_PREREQ
+#  define __GMP_GNUC_PREREQ(a,b) 1
+# endif
+# if _ARCH_PPC
+#  define HAVE_HOST_CPU_FAMILY_powerpc 1
+# endif
+# include "longlong.h"
+# ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+const unsigned char factor_clz_tab[129] =
+{
+  1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  9
+};
+# endif
+
+#else /* not USE_LONGLONG_H */
+
+# define W_TYPE_SIZE (8 * sizeof (uintmax_t))
+# define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
+# define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
+# define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
+
+#endif
+
+enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
+
+static enum alg_type alg;
+
+/* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
+#define MAX_NFACTS 26
+
+struct factors
+{
+  uintmax_t     plarge[2]; /* Can have a single large factor */
+  uintmax_t     p[MAX_NFACTS];
+  unsigned char e[MAX_NFACTS];
+  unsigned char nfactors;
+};
+
+#if HAVE_GMP
+struct mp_factors
+{
+  mpz_t         *p;
+  unsigned long int *e;
+  unsigned long int nfactors;
+};
+#endif
+
+static void factor (uintmax_t, uintmax_t, struct factors *);
+
+#ifndef umul_ppmm
+# define umul_ppmm(w1, w0, u, v)                                         \
+  do {                                                                  \
+    uintmax_t __x0, __x1, __x2, __x3;                                   \
+    unsigned long int __ul, __vl, __uh, __vh;                           \
+    uintmax_t __u = (u), __v = (v);                                     \
+                                                                        \
+    __ul = __ll_lowpart (__u);                                          \
+    __uh = __ll_highpart (__u);                                         \
+    __vl = __ll_lowpart (__v);                                          \
+    __vh = __ll_highpart (__v);                                         \
+                                                                        \
+    __x0 = (uintmax_t) __ul * __vl;                                     \
+    __x1 = (uintmax_t) __ul * __vh;                                     \
+    __x2 = (uintmax_t) __uh * __vl;                                     \
+    __x3 = (uintmax_t) __uh * __vh;                                     \
+                                                                        \
+    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
+    __x1 += __x2;               /* but this indeed can */               \
+    if (__x1 < __x2)            /* did we get it? */                    \
+      __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
+                                                                        \
+    (w1) = __x3 + __ll_highpart (__x1);                                 \
+    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
+  } while (0)
+#endif
+
+#if !defined(udiv_qrnnd) || defined(UDIV_NEEDS_NORMALIZATION)
+/* Define our own, not needing normalization. This function is
+   currently not performance critical, so keep it simple. Similar to
+   the mod macro below. */
+# undef udiv_qrnnd
+# define udiv_qrnnd(q, r, n1, n0, d)                                     \
+  do {                                                                  \
+    uintmax_t __d1, __d0, __q, __r1, __r0;                              \
+                                                                        \
+    assert ((n1) < (d));                                                \
+    __d1 = (d); __d0 = 0;                                               \
+    __r1 = (n1); __r0 = (n0);                                           \
+    __q = 0;                                                            \
+    for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
+      {                                                                 \
+        rsh2 (__d1, __d0, __d1, __d0, 1);                               \
+        __q <<= 1;                                                      \
+        if (ge2 (__r1, __r0, __d1, __d0))                               \
+          {                                                             \
+            __q++;                                                      \
+            sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
+          }                                                             \
+      }                                                                 \
+    (r) = __r0;                                                         \
+    (q) = __q;                                                          \
+  } while (0)
+#endif
+
+#if !defined (add_ssaaaa)
+# define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
+  do {                                                                  \
+    uintmax_t _add_x;                                                   \
+    _add_x = (al) + (bl);                                               \
+    (sh) = (ah) + (bh) + (_add_x < (al));                               \
+    (sl) = _add_x;                                                      \
+  } while (0)
+#endif
+
+#define rsh2(rh, rl, ah, al, cnt)                                       \
+  do {                                                                  \
+    (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
+    (rh) = (ah) >> (cnt);                                               \
+  } while (0)
+
+#define lsh2(rh, rl, ah, al, cnt)                                       \
+  do {                                                                  \
+    (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
+    (rl) = (al) << (cnt);                                               \
+  } while (0)
+
+#define ge2(ah, al, bh, bl)                                             \
+  ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
+
+#define gt2(ah, al, bh, bl)                                             \
+  ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
+
+#ifndef sub_ddmmss
+# define sub_ddmmss(rh, rl, ah, al, bh, bl)                              \
+  do {                                                                  \
+    uintmax_t _cy;                                                      \
+    _cy = (al) < (bl);                                                  \
+    (rl) = (al) - (bl);                                                 \
+    (rh) = (ah) - (bh) - _cy;                                           \
+  } while (0)
+#endif
+
+#ifndef count_leading_zeros
+# define count_leading_zeros(count, x) do {                              \
+    uintmax_t __clz_x = (x);                                            \
+    unsigned int __clz_c;                                               \
+    for (__clz_c = 0;                                                   \
+         (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
+         __clz_c += 8)                                                  \
+      __clz_x <<= 8;                                                    \
+    for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
+      __clz_x <<= 1;                                                    \
+    (count) = __clz_c;                                                  \
+  } while (0)
+#endif
+
+#ifndef count_trailing_zeros
+# define count_trailing_zeros(count, x) do {                             \
+    uintmax_t __ctz_x = (x);                                            \
+    unsigned int __ctz_c = 0;                                           \
+    while ((__ctz_x & 1) == 0)                                          \
+      {                                                                 \
+        __ctz_x >>= 1;                                                  \
+        __ctz_c++;                                                      \
+      }                                                                 \
+    (count) = __ctz_c;                                                  \
+  } while (0)
+#endif
+
+/* Requires that a < n and b <= n */
+#define submod(r,a,b,n)                                                 \
+  do {                                                                  \
+    uintmax_t _t = - (uintmax_t) (a < b);                               \
+    (r) = ((n) & _t) + (a) - (b);                                       \
+  } while (0)
+
+#define addmod(r,a,b,n)                                                 \
+  submod((r), (a), ((n) - (b)), (n))
+
+/* Modular two-word addition and subtraction.  For performance reasons, the
+   most significant bit of n1 must be clear.  The destination variables must be
+   distinct from the mod operand.  */
+#define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
+  do {                                                                  \
+    add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
+    if (ge2 ((r1), (r0), (n1), (n0)))                                   \
+      sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
+  } while (0)
+#define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
+  do {                                                                  \
+    sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
+    if ((intmax_t) (r1) < 0)                                            \
+      add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
+  } while (0)
+
+#define HIGHBIT_TO_MASK(x)                                              \
+  (((intmax_t)-1 >> 1) < 0                                              \
+   ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
+   : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
+      ? UINTMAX_MAX : (uintmax_t) 0))
+
+/* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
+   Requires that d1 != 0.  */
+static uintmax_t
+mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
+{
+  int cntd, cnta;
+
+  assert (d1 != 0);
+
+  if (a1 == 0)
+    {
+      *r1 = 0;
+      return a0;
+    }
+
+  count_leading_zeros (cntd, d1);
+  count_leading_zeros (cnta, a1);
+  int cnt = cntd - cnta;
+  lsh2 (d1, d0, d1, d0, cnt);
+  for (int i = 0; i < cnt; i++)
+    {
+      if (ge2 (a1, a0, d1, d0))
+        sub_ddmmss (a1, a0, a1, a0, d1, d0);
+      rsh2 (d1, d0, d1, d0, 1);
+    }
+
+  *r1 = a1;
+  return a0;
+}
+
+static uintmax_t _GL_ATTRIBUTE_CONST
+gcd_odd (uintmax_t a, uintmax_t b)
+{
+  if ( (b & 1) == 0)
+    {
+      uintmax_t t = b;
+      b = a;
+      a = t;
+    }
+  if (a == 0)
+    return b;
+
+  /* Take out least significant one bit, to make room for sign */
+  b >>= 1;
+
+  for (;;)
+    {
+      uintmax_t t;
+      uintmax_t bgta;
+
+      while ((a & 1) == 0)
+        a >>= 1;
+      a >>= 1;
+
+      t = a - b;
+      if (t == 0)
+        return (a << 1) + 1;
+
+      bgta = HIGHBIT_TO_MASK (t);
+
+      /* b <-- min (a, b) */
+      b += (bgta & t);
+
+      /* a <-- |a - b| */
+      a = (t ^ bgta) - bgta;
+    }
+}
+
+static uintmax_t
+gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t 
b0)
+{
+  while ((a0 & 1) == 0)
+    rsh2 (a1, a0, a1, a0, 1);
+  while ((b0 & 1) == 0)
+    rsh2 (b1, b0, b1, b0, 1);
+
+  for (;;)
+    {
+      if ((b1 | a1) == 0)
+        {
+          *r1 = 0;
+          return gcd_odd (b0, a0);
+        }
+
+      if (gt2 (a1, a0, b1, b0))
+        {
+          sub_ddmmss (a1, a0, a1, a0, b1, b0);
+          do
+            rsh2 (a1, a0, a1, a0, 1);
+          while ((a0 & 1) == 0);
+        }
+      else if (gt2 (b1, b0, a1, a0))
+        {
+          sub_ddmmss (b1, b0, b1, b0, a1, a0);
+          do
+            rsh2 (b1, b0, b1, b0, 1);
+          while ((b0 & 1) == 0);
+        }
+      else
+        break;
+    }
+
+  *r1 = a1;
+  return a0;
+}
+
+static void
+factor_insert_multiplicity (struct factors *factors,
+                            uintmax_t prime, unsigned int m)
+{
+  unsigned int nfactors = factors->nfactors;
+  uintmax_t *p = factors->p;
+  unsigned char *e = factors->e;
+
+  /* Locate position for insert new or increment e.  */
+  int i;
+  for (i = nfactors - 1; i >= 0; i--)
+    {
+      if (p[i] <= prime)
+        break;
+    }
+
+  if (i < 0 || p[i] != prime)
+    {
+      for (int j = nfactors - 1; j > i; j--)
+        {
+          p[j + 1] = p[j];
+          e[j + 1] = e[j];
+        }
+      p[i + 1] = prime;
+      e[i + 1] = m;
+      factors->nfactors = nfactors + 1;
+    }
+  else
+    {
+      e[i] += m;
+    }
+}
+
+#define factor_insert(f, p) factor_insert_multiplicity(f, p, 1)
+
+static void
+factor_insert_large (struct factors *factors,
+                     uintmax_t p1, uintmax_t p0)
+{
+  if (p1 > 0)
+    {
+      assert (factors->plarge[1] == 0);
+      factors->plarge[0] = p0;
+      factors->plarge[1] = p1;
+    }
+  else
+    factor_insert (factors, p0);
+}
+
+#if HAVE_GMP
+static void mp_factor (mpz_t, struct mp_factors *);
+
+static void
+mp_factor_init (struct mp_factors *factors)
+{
+  factors->p = malloc (1);
+  factors->e = malloc (1);
+  factors->nfactors = 0;
+}
+
+static void
+mp_factor_clear (struct mp_factors *factors)
+{
+  for (unsigned int i = 0; i < factors->nfactors; i++)
+    mpz_clear (factors->p[i]);
+
+  free (factors->p);
+  free (factors->e);
+}
+
+static void
+mp_factor_insert (struct mp_factors *factors, mpz_t prime)
+{
+  unsigned long int nfactors = factors->nfactors;
+  mpz_t         *p  = factors->p;
+  unsigned long int *e  = factors->e;
+  long i;
+
+  /* Locate position for insert new or increment e.  */
+  for (i = nfactors - 1; i >= 0; i--)
+    {
+      if (mpz_cmp (p[i], prime) <= 0)
+        break;
+    }
+
+  if (i < 0 || mpz_cmp (p[i], prime) != 0)
+    {
+      p = realloc (p, (nfactors + 1) * sizeof p[0]);
+      e = realloc (e, (nfactors + 1) * sizeof e[0]);
+
+      mpz_init (p[nfactors]);
+      for (long j = nfactors - 1; j > i; j--)
+        {
+          mpz_set (p[j + 1], p[j]);
+          e[j + 1] = e[j];
+        }
+      mpz_set (p[i + 1], prime);
+      e[i + 1] = 1;
+
+      factors->p = p;
+      factors->e = e;
+      factors->nfactors = nfactors + 1;
+    }
+  else
+    {
+      e[i] += 1;
+    }
+}
+
+static void
+mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
+{
+  mpz_t pz;
+
+  mpz_init_set_ui (pz, prime);
+  mp_factor_insert (factors, pz);
+  mpz_clear (pz);
+}
+#endif /* HAVE_GMP */
+
+
+#define P(a,b,c,d) a,
+static const unsigned char primes_diff[] = {
+#include "primes.h"
+0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
+};
+#undef P
+
+#define PRIMES_PTAB_ENTRIES (sizeof(primes_diff) / sizeof(primes_diff[0]) - 8 
+ 1)
+
+#define P(a,b,c,d) b,
+static const unsigned char primes_diff8[] = {
+#include "primes.h"
+0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
+};
+#undef P
+
+struct primes_dtab
+{
+  uintmax_t binv, lim;
+};
+
+#define P(a,b,c,d) {c,d},
+static const struct primes_dtab primes_dtab[] = {
+#include "primes.h"
+{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
+};
+#undef P
+
+/* This flag is honoured just in the GMP code. */
+static int flag_verbose = 0;
+
+/* Prove primality or run probabilistic tests.  */
+static int flag_prove_primality = 1;
+
+/* Number of Miller-Rabin tests to run when not proving primality. */
+#define MR_REPS 25
+
+#define LIKELY(cond)    __builtin_expect ((cond) != 0, 1)
+#define UNLIKELY(cond)  __builtin_expect ((cond) != 0, 0)
+
+static void
+factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
+                      unsigned int off)
+{
+  for (unsigned int j = 0; j < off; j++)
+    p += primes_diff[i + j];
+  factor_insert (factors, p);
+}
+
+/* Trial division with odd primes uses the following trick.
+
+   Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
+   consider the case t < B (this is the second loop below).
+
+   From our tables we get
+
+     binv = p^{-1} (mod B)
+     lim = floor ( (B-1) / p ).
+
+   First assume that t is a multiple of p, t = q * p. Then 0 <= q <=
+   lim (and all quotients in this range occur for some t).
+
+   Then t = q * p is true also (mod B), and p is invertible we get
+
+     q = t * binv (mod B).
+
+   Next, assume that t is *not* divisible by p. Since multiplication
+   by binv (mod B) is a one-to-one mapping,
+
+     t * binv (mod B) > lim,
+
+   because all the smaller values are already taken.
+
+   This can be summed up by saying that the function
+
+     q(t) = binv * t (mod B)
+
+   is a permutation of the range 0 <= t < B, with the curious property
+   that it maps the multiples of p onto the range 0 <= q <= lim, in
+   order, and the non-multiples of p onto the range lim < q < B.
+ */
+
+static uintmax_t
+factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
+                       struct factors *factors)
+{
+  if (t0 % 2 == 0)
+    {
+      unsigned int cnt;
+
+      if (t0 == 0)
+        {
+          count_trailing_zeros (cnt, t1);
+          t0 = t1 >> cnt;
+          t1 = 0;
+          cnt += W_TYPE_SIZE;
+        }
+      else
+        {
+          count_trailing_zeros (cnt, t0);
+          rsh2 (t1, t0, t1, t0, cnt);
+        }
+
+      factor_insert_multiplicity (factors, 2, cnt);
+    }
+
+  uintmax_t p = 3;
+  unsigned int i;
+  for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
+    {
+      for (;;)
+        {
+          uintmax_t q1, q0, hi, lo;
+
+          q0 = t0 * primes_dtab[i].binv;
+          umul_ppmm (hi, lo, q0, p);
+          if (hi > t1)
+            break;
+          hi = t1 - hi;
+          q1 = hi * primes_dtab[i].binv;
+          if (LIKELY (q1 > primes_dtab[i].lim))
+            break;
+          t1 = q1; t0 = q0;
+          factor_insert (factors, p);
+        }
+      p += primes_diff[i + 1];
+    }
+  if (t1p)
+    *t1p = t1;
+
+#define DIVBLOCK(I)                                                     \
+  do {                                                                  \
+    for (;;)                                                            \
+      {                                                                 \
+        q = t0 * pd[I].binv;                                            \
+        if (LIKELY (q > pd[I].lim))                                     \
+          break;                                                        \
+        t0 = q;                                                         \
+        factor_insert_refind (factors, p, i + 1, I);                    \
+      }                                                                 \
+  } while (0)
+
+  for (; i < PRIMES_PTAB_ENTRIES; i += 8)
+    {
+      uintmax_t q;
+      const struct primes_dtab *pd = &primes_dtab[i];
+      DIVBLOCK (0);
+      DIVBLOCK (1);
+      DIVBLOCK (2);
+      DIVBLOCK (3);
+      DIVBLOCK (4);
+      DIVBLOCK (5);
+      DIVBLOCK (6);
+      DIVBLOCK (7);
+
+      p += primes_diff8[i];
+      if (p * p > t0)
+        break;
+    }
+
+  return t0;
+}
+
+#if HAVE_GMP
+static void
+mp_factor_using_division (mpz_t t, struct mp_factors *factors)
+{
+  mpz_t q;
+  unsigned long int p;
+
+  if (flag_verbose > 0)
+    {
+      printf ("[trial division] ");
+    }
+
+  mpz_init (q);
+
+  p = mpz_scan1 (t, 0);
+  mpz_div_2exp (t, t, p);
+  while (p)
+    {
+      mp_factor_insert_ui (factors, 2);
+      --p;
+    }
+
+  p = 3;
+  for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
+    {
+      if (! mpz_divisible_ui_p (t, p))
+        {
+          p += primes_diff[i++];
+          if (mpz_cmp_ui (t, p * p) < 0)
+            break;
+        }
+      else
+        {
+          mpz_tdiv_q_ui (t, t, p);
+          mp_factor_insert_ui (factors, p);
+        }
+    }
+
+  mpz_clear (q);
+}
+#endif
+
+/* Entry i contains (2i+1)^(-1) mod 2^8.  */
+static const unsigned char  binvert_table[128] =
+{
+  0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
+  0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
+  0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
+  0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
+  0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
+  0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
+  0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
+  0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
+  0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
+  0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
+  0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
+  0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
+  0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
+  0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
+  0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
+  0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
+};
+
+/* Compute n^(-1) mod B, using a Newton iteration.  */
+#define binv(inv,n)                                                     \
+  do {                                                                  \
+    uintmax_t  __n = (n);                                               \
+    uintmax_t  __inv;                                                   \
+                                                                        \
+    __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
+    if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
+    if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
+    if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
+                                                                        \
+    if (W_TYPE_SIZE > 64)                                               \
+      {                                                                 \
+        int  __invbits = 64;                                            \
+        do {                                                            \
+          __inv = 2 * __inv - __inv * __inv * __n;                      \
+          __invbits *= 2;                                               \
+        } while (__invbits < W_TYPE_SIZE);                              \
+      }                                                                 \
+                                                                        \
+    (inv) = __inv;                                                      \
+  } while (0)
+
+/* q = u / d, assuming d|u.  */
+#define divexact_21(q1, q0, u1, u0, d)                                  \
+  do {                                                                  \
+    uintmax_t _di, _q0;                                                 \
+    binv (_di, (d));                                                    \
+    _q0 = (u0) * _di;                                                   \
+    if ((u1) >= (d))                                                    \
+      {                                                                 \
+        uintmax_t _p1, _p0;                                             \
+        umul_ppmm (_p1, _p0, _q0, d);                                   \
+        (q1) = ((u1) - _p1) * _di;                                      \
+        (q0) = _q0;                                                     \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        (q0) = _q0;                                                     \
+        (q1) = 0;                                                       \
+      }                                                                 \
+  } while(0)
+
+/* x B (mod n). */
+#define redcify(r_prim, r, n)                                           \
+  do {                                                                  \
+    uintmax_t _redcify_q;                                               \
+    udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
+  } while (0)
+
+/* x B^2 (mod n). Requires x > 0, n1 < B/2 */
+#define redcify2(r1, r0, x, n1, n0)                                     \
+  do {                                                                  \
+    uintmax_t _r1, _r0, _i;                                             \
+    if ((x) < (n1))                                                     \
+      {                                                                 \
+        _r1 = (x); _r0 = 0;                                             \
+        _i = W_TYPE_SIZE;                                               \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        _r1 = 0; _r0 = (x);                                             \
+        _i = 2*W_TYPE_SIZE;                                             \
+      }                                                                 \
+    while (_i-- > 0)                                                    \
+      {                                                                 \
+        lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
+        if (ge2 (_r1, _r0, (n1), (n0)))                                 \
+          sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
+      }                                                                 \
+    (r1) = _r1;                                                         \
+    (r0) = _r0;                                                         \
+  } while (0)
+
+/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
+   Both a and b must be in redc form, the result will be in redc form too. */
+static inline uintmax_t
+mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
+{
+  uintmax_t rh, rl, q, th, tl, xh;
+
+  umul_ppmm (rh, rl, a, b);
+  q = rl * mi;
+  umul_ppmm (th, tl, q, m);
+  xh = rh - th;
+  if (rh < th)
+    xh += m;
+
+  return xh;
+}
+
+/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
+   Both a and b must be in redc form, the result will be in redc form too.
+   For performance reasons, the most significant bit of m must be clear. */
+static uintmax_t
+mulredc2 (uintmax_t *r1p,
+          uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
+          uintmax_t m1, uintmax_t m0, uintmax_t mi)
+{
+  uintmax_t r1, r0, q, p1, p0, t1, t0, s1, s0;
+  mi = -mi;
+  assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
+  assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
+  assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
+
+  /* First compute a0 * <b1, b0> B^{-1}
+        +-----+
+        |a0 b0|
+     +--+--+--+
+     |a0 b1|
+     +--+--+--+
+        |q0 m0|
+     +--+--+--+
+     |q0 m1|
+    -+--+--+--+
+     |r1|r0| 0|
+     +--+--+--+
+  */
+  umul_ppmm (t1, t0, a0, b0);
+  umul_ppmm (r1, r0, a0, b1);
+  q = mi * t0;
+  umul_ppmm (p1, p0, q, m0);
+  umul_ppmm (s1, s0, q, m1);
+  r0 += (t0 != 0); /* Carry */
+  add_ssaaaa (r1, r0, r1, r0, 0, p1);
+  add_ssaaaa (r1, r0, r1, r0, 0, t1);
+  add_ssaaaa (r1, r0, r1, r0, s1, s0);
+
+  /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
+        +-----+
+        |a1 b0|
+        +--+--+
+        |r1|r0|
+     +--+--+--+
+     |a1 b1|
+     +--+--+--+
+        |q1 m0|
+     +--+--+--+
+     |q1 m1|
+    -+--+--+--+
+     |r1|r0| 0|
+     +--+--+--+
+  */
+  umul_ppmm (t1, t0, a1, b0);
+  umul_ppmm (s1, s0, a1, b1);
+  add_ssaaaa (t1, t0, t1, t0, 0, r0);
+  q = mi * t0;
+  add_ssaaaa (r1, r0, s1, s0, 0, r1);
+  umul_ppmm (p1, p0, q, m0);
+  umul_ppmm (s1, s0, q, m1);
+  r0 += (t0 != 0); /* Carry */
+  add_ssaaaa (r1, r0, r1, r0, 0, p1);
+  add_ssaaaa (r1, r0, r1, r0, 0, t1);
+  add_ssaaaa (r1, r0, r1, r0, s1, s0);
+
+  if (ge2 (r1, r0, m1, m0))
+    sub_ddmmss (r1, r0, r1, r0, m1, m0);
+
+  *r1p = r1;
+  return r0;
+}
+
+static uintmax_t _GL_ATTRIBUTE_CONST
+powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
+{
+  uintmax_t y = one;
+
+  if (e & 1)
+    y = b;
+
+  while (e != 0)
+    {
+      b = mulredc (b, b, n, ni);
+      e >>= 1;
+
+      if (e & 1)
+        y = mulredc (y, b, n, ni);
+    }
+
+  return y;
+}
+
+static uintmax_t
+powm2 (uintmax_t *r1m,
+       const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
+       uintmax_t ni, const uintmax_t *one)
+{
+  uintmax_t r1, r0, b1, b0, n1, n0;
+  unsigned int i;
+  uintmax_t e;
+
+  b0 = bp[0];
+  b1 = bp[1];
+  n0 = np[0];
+  n1 = np[1];
+
+  r0 = one[0];
+  r1 = one[1];
+
+  for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
+    {
+      if (e & 1)
+        {
+          r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
+          r1 = *r1m;
+        }
+      b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
+      b1 = *r1m;
+    }
+  for (e = ep[1]; e > 0; e >>= 1)
+    {
+      if (e & 1)
+        {
+          r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
+          r1 = *r1m;
+        }
+      b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
+      b1 = *r1m;
+    }
+  *r1m = r1;
+  return r0;
+}
+
+static bool _GL_ATTRIBUTE_CONST
+millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
+             unsigned int k, uintmax_t one)
+{
+  uintmax_t y = powm (b, q, n, ni, one);
+
+  uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
+
+  if (y == one || y == nm1)
+    return true;
+
+  for (unsigned int i = 1; i < k; i++)
+    {
+      y = mulredc (y, y, n, ni);
+
+      if (y == nm1)
+        return true;
+      if (y == one)
+        return false;
+    }
+  return false;
+}
+
+static bool
+millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
+              const uintmax_t *qp, unsigned int k, const uintmax_t *one)
+{
+  uintmax_t y1, y0, nm1_1, nm1_0, r1m;
+
+  y0 = powm2 (&r1m, bp, qp, np, ni, one);
+  y1 = r1m;
+
+  if (y0 == one[0] && y1 == one[1])
+    return true;
+
+  sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
+
+  if (y0 == nm1_0 && y1 == nm1_1)
+    return true;
+
+  for (unsigned int i = 1; i < k; i++)
+    {
+      y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
+      y1 = r1m;
+
+      if (y0 == nm1_0 && y1 == nm1_1)
+        return true;
+      if (y0 == one[0] && y1 == one[1])
+        return false;
+    }
+  return false;
+}
+
+#if HAVE_GMP
+static bool
+mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
+                mpz_srcptr q, unsigned long int k)
+{
+  mpz_powm (y, x, q, n);
+
+  if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
+    return true;
+
+  for (unsigned long int i = 1; i < k; i++)
+    {
+      mpz_powm_ui (y, y, 2, n);
+      if (mpz_cmp (y, nm1) == 0)
+        return true;
+      if (mpz_cmp_ui (y, 1) == 0)
+        return false;
+    }
+  return false;
+}
+#endif
+
+/* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
+   have been observed.  The average seem to be about 2.  */
+static bool
+prime_p (uintmax_t n)
+{
+  int k;
+  bool is_prime;
+  uintmax_t a_prim, one, ni;
+  struct factors factors;
+
+  if (n <= 1)
+    return false;
+
+  /* We have already casted out small primes. */
+  if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
+    return true;
+
+  /* Precomputation for Miller-Rabin.  */
+  uintmax_t q = n - 1;
+  for (k = 0; (q & 1) == 0; k++)
+    q >>= 1;
+
+  uintmax_t a = 2;
+  binv (ni, n);                 /* ni <- 1/n mod B */
+  redcify (one, 1, n);
+  addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
+
+  /* Perform a Miller-Rabin test, finds most composites quickly.  */
+  if (!millerrabin (n, ni, a_prim, q, k, one))
+    return false;
+
+  if (flag_prove_primality)
+    {
+      /* Factor n-1 for Lucas.  */
+      factor (0, n - 1, &factors);
+    }
+
+  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
+     number composite.  */
+  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
+    {
+      if (flag_prove_primality)
+        {
+          is_prime = true;
+          for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
+            {
+              is_prime = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != 
one;
+            }
+        }
+      else
+        {
+          /* After enough Miller-Rabin runs, be content. */
+          is_prime = (r == MR_REPS - 1);
+        }
+
+      if (is_prime)
+        return true;
+
+      a += primes_diff[r];      /* Establish new base.  */
+
+      /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
+         on most processors, since it avoids udiv_qrnnd.  If we go down the
+         udiv_qrnnd_preinv path, this code should be replaced.  */
+      {
+        uintmax_t dummy, s1, s0;
+        umul_ppmm (s1, s0, one, a);
+        if (LIKELY (s1 == 0))
+          a_prim = s0 % n;
+        else
+          udiv_qrnnd (dummy, a_prim, s1, s0, n);
+      }
+
+      if (!millerrabin (n, ni, a_prim, q, k, one))
+        return false;
+    }
+
+  fprintf (stderr, "Lucas prime test failure.  This should not happen\n");
+  abort ();
+}
+
+static bool
+prime2_p (uintmax_t n1, uintmax_t n0)
+{
+  uintmax_t q[2], nm1[2];
+  uintmax_t a_prim[2];
+  uintmax_t one[2];
+  uintmax_t na[2];
+  uintmax_t ni;
+  unsigned int k;
+  struct factors factors;
+
+  if (n1 == 0)
+    return prime_p (n0);
+
+  nm1[1] = n1 - (n0 == 0);
+  nm1[0] = n0 - 1;
+  if (nm1[0] == 0)
+    {
+      count_trailing_zeros (k, nm1[1]);
+
+      q[0] = nm1[1] >> k;
+      q[1] = 0;
+      k += W_TYPE_SIZE;
+    }
+  else
+    {
+      count_trailing_zeros (k, nm1[0]);
+      rsh2 (q[1], q[0], nm1[1], nm1[0], k);
+    }
+
+  uintmax_t a = 2;
+  binv (ni, n0);
+  redcify2 (one[1], one[0], 1, n1, n0);
+  addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
+
+  /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
+  na[0] = n0;
+  na[1] = n1;
+
+  if (!millerrabin2 (na, ni, a_prim, q, k, one))
+    return false;
+
+  if (flag_prove_primality)
+    {
+      /* Factor n-1 for Lucas.  */
+      factor (nm1[1], nm1[0], &factors);
+    }
+
+  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
+     number composite.  */
+  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
+    {
+      bool is_prime;
+      uintmax_t e[2], y[2];
+
+      if (flag_prove_primality)
+        {
+          is_prime = true;
+          if (factors.plarge[1])
+            {
+              uintmax_t pi;
+              binv (pi, factors.plarge[0]);
+              e[0] = pi * nm1[0];
+              e[1] = 0;
+              y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
+              is_prime = (y[0] != one[0] || y[1] != one[1]);
+            }
+          for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
+            {
+              /* FIXME: We always have the factor 2. Do we really need to 
handle it
+                 here? We have done the same powering as part of millerrabin. 
*/
+              if (factors.p[i] == 2)
+                rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
+              else
+                divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
+              y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
+              is_prime = (y[0] != one[0] || y[1] != one[1]);
+            }
+        }
+      else
+        {
+          /* After enough Miller-Rabin runs, be content. */
+          is_prime = (r == MR_REPS - 1);
+        }
+
+      if (is_prime)
+        return true;
+
+      a += primes_diff[r];      /* Establish new base.  */
+      redcify2 (a_prim[1], a_prim[0], a, n1, n0);
+
+      if (!millerrabin2 (na, ni, a_prim, q, k, one))
+        return false;
+    }
+
+  fprintf (stderr, "Lucas prime test failure.  This should not happen\n");
+  abort ();
+}
+
+#if HAVE_GMP
+static bool
+mp_prime_p (mpz_t n)
+{
+  bool is_prime;
+  mpz_t q, a, nm1, tmp;
+  struct mp_factors factors;
+
+  if (mpz_cmp_ui (n, 1) <= 0)
+    return false;
+
+  /* We have already casted out small primes. */
+  if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
+    return true;
+
+  mpz_inits (q, a, nm1, tmp, NULL);
+
+  /* Precomputation for Miller-Rabin.  */
+  mpz_sub_ui (nm1, n, 1);
+
+  /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
+  unsigned long int k = mpz_scan1 (nm1, 0);
+  mpz_tdiv_q_2exp (q, nm1, k);
+
+  mpz_set_ui (a, 2);
+
+  /* Perform a Miller-Rabin test, finds most composites quickly.  */
+  if (!mp_millerrabin (n, nm1, a, tmp, q, k))
+    {
+      is_prime = false;
+      goto ret2;
+    }
+
+  if (flag_prove_primality)
+    {
+      /* Factor n-1 for Lucas.  */
+      mpz_set (tmp, nm1);
+      mp_factor (tmp, &factors);
+    }
+
+  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
+     number composite.  */
+  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
+    {
+      if (flag_prove_primality)
+        {
+          is_prime = true;
+          for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
+            {
+              mpz_divexact (tmp, nm1, factors.p[i]);
+              mpz_powm (tmp, a, tmp, n);
+              is_prime = mpz_cmp_ui (tmp, 1) != 0;
+            }
+        }
+      else
+        {
+          /* After enough Miller-Rabin runs, be content. */
+          is_prime = (r == MR_REPS - 1);
+        }
+
+      if (is_prime)
+        goto ret1;
+
+      mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
+
+      if (!mp_millerrabin (n, nm1, a, tmp, q, k))
+        {
+          is_prime = false;
+          goto ret1;
+        }
+    }
+
+  fprintf (stderr, "Lucas prime test failure.  This should not happen\n");
+  abort ();
+
+ ret1:
+  if (flag_prove_primality)
+    mp_factor_clear (&factors);
+ ret2:
+  mpz_clears (q, a, nm1, tmp, NULL);
+
+  return is_prime;
+}
+#endif
+
+static void
+factor_using_pollard_rho (uintmax_t n, unsigned long int a,
+                          struct factors *factors)
+{
+  uintmax_t x, z, y, P, t, ni, g;
+
+  unsigned long int k = 1;
+  unsigned long int l = 1;
+
+  redcify (P, 1, n);
+  addmod (x, P, P, n);          /* i.e., redcify(2) */
+  y = z = x;
+
+  while (n != 1)
+    {
+      assert (a < n);
+
+      binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
+
+      for (;;)
+        {
+          do
+            {
+              x = mulredc (x, x, n, ni);
+              addmod (x, x, a, n);
+
+              submod (t, z, x, n);
+              P = mulredc (P, t, n, ni);
+
+              if (k % 32 == 1)
+                {
+                  if (gcd_odd (P, n) != 1)
+                    goto factor_found;
+                  y = x;
+                }
+            }
+          while (--k != 0);
+
+          z = x;
+          k = l;
+          l = 2 * l;
+          for (unsigned long int i = 0; i < k; i++)
+            {
+              x = mulredc (x, x, n, ni);
+              addmod (x, x, a, n);
+            }
+          y = x;
+        }
+
+    factor_found:
+      do
+        {
+          y = mulredc (y, y, n, ni);
+          addmod (y, y, a, n);
+
+          submod (t, z, y, n);
+          g = gcd_odd (t, n);
+        }
+      while (g == 1);
+
+      n = n / g;
+
+      if (!prime_p (g))
+        factor_using_pollard_rho (g, a + 1, factors);
+      else
+        factor_insert (factors, g);
+
+      if (prime_p (n))
+        {
+          factor_insert (factors, n);
+          break;
+        }
+
+      x = x % n;
+      z = z % n;
+      y = y % n;
+    }
+}
+
+static void
+factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
+                           struct factors *factors)
+{
+  uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
+
+  unsigned long int k = 1;
+  unsigned long int l = 1;
+
+  redcify2 (P1, P0, 1, n1, n0);
+  addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
+  y1 = z1 = x1;
+  y0 = z0 = x0;
+
+  while (n1 != 0 || n0 != 1)
+    {
+      binv (ni, n0);
+
+      for (;;)
+        {
+          do
+            {
+              x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
+              x1 = r1m;
+              addmod2 (x1, x0, x1, x0, 0, a, n1, n0);
+
+              submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
+              P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
+              P1 = r1m;
+
+              if (k % 32 == 1)
+                {
+                  g0 = gcd2_odd (&g1, P1, P0, n1, n0);
+                  if (g1 != 0 || g0 != 1)
+                    goto factor_found;
+                  y1 = x1; y0 = x0;
+                }
+            }
+          while (--k != 0);
+
+          z1 = x1; z0 = x0;
+          k = l;
+          l = 2 * l;
+          for (unsigned long int i = 0; i < k; i++)
+            {
+              x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
+              x1 = r1m;
+              addmod2 (x1, x0, x1, x0, 0, a, n1, n0);
+            }
+          y1 = x1; y0 = x0;
+        }
+
+    factor_found:
+      do
+        {
+          y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
+          y1 = r1m;
+          addmod2 (y1, y0, y1, y0, 0, a, n1, n0);
+
+          submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
+          g0 = gcd2_odd (&g1, t1, t0, n1, n0);
+        }
+      while (g1 == 0 && g0 == 1);
+
+      if (g1 == 0)
+        {
+          /* The found factor is one word. */
+          divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
+
+          if (!prime_p (g0))
+            factor_using_pollard_rho (g0, a + 1, factors);
+          else
+            factor_insert (factors, g0);
+        }
+      else
+        {
+          /* The found factor is two words.  This is highly unlikely, thus hard
+             to trigger.  Please be careful before you change this code!  */
+          uintmax_t ginv;
+
+          binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
+          n0 = ginv * n0;       /* fit one word, we can compute the quotient */
+          n1 = 0;               /* modulo B, ignoring the high divisor word. */
+
+          if (!prime2_p (g1, g0))
+            factor_using_pollard_rho2 (g1, g0, a + 1, factors);
+          else
+            factor_insert_large (factors, g1, g0);
+        }
+
+      if (n1 == 0)
+        {
+          if (prime_p (n0))
+            {
+              factor_insert (factors, n0);
+              break;
+            }
+
+          factor_using_pollard_rho (n0, a, factors);
+          return;
+        }
+
+      if (prime2_p (n1, n0))
+        {
+          factor_insert_large (factors, n1, n0);
+          break;
+        }
+
+      x0 = mod2 (&x1, x1, x0, n1, n0);
+      z0 = mod2 (&z1, z1, z0, n1, n0);
+      y0 = mod2 (&y1, y1, y0, n1, n0);
+    }
+}
+
+#if HAVE_GMP
+static void
+mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
+                             struct mp_factors *factors)
+{
+  mpz_t x, z, y, P;
+  mpz_t t, t2;
+  unsigned long long int k, l;
+
+  if (flag_verbose > 0)
+    {
+      printf ("[pollard-rho (%lu)] ", a);
+    }
+
+  mpz_inits (t, t2, NULL);
+  mpz_init_set_si (y, 2);
+  mpz_init_set_si (x, 2);
+  mpz_init_set_si (z, 2);
+  mpz_init_set_ui (P, 1);
+  k = 1;
+  l = 1;
+
+  while (mpz_cmp_ui (n, 1) != 0)
+    {
+      for (;;)
+        {
+          do
+            {
+              mpz_mul (t, x, x);
+              mpz_mod (x, t, n);
+              mpz_add_ui (x, x, a);
+
+              mpz_sub (t, z, x);
+              mpz_mul (t2, P, t);
+              mpz_mod (P, t2, n);
+
+              if (k % 32 == 1)
+                {
+                  mpz_gcd (t, P, n);
+                  if (mpz_cmp_ui (t, 1) != 0)
+                    goto factor_found;
+                  mpz_set (y, x);
+                }
+            }
+          while (--k != 0);
+
+          mpz_set (z, x);
+          k = l;
+          l = 2 * l;
+          for (unsigned long long int i = 0; i < k; i++)
+            {
+              mpz_mul (t, x, x);
+              mpz_mod (x, t, n);
+              mpz_add_ui (x, x, a);
+            }
+          mpz_set (y, x);
+        }
+
+    factor_found:
+      do
+        {
+          mpz_mul (t, y, y);
+          mpz_mod (y, t, n);
+          mpz_add_ui (y, y, a);
+
+          mpz_sub (t, z, y);
+          mpz_gcd (t, t, n);
+        }
+      while (mpz_cmp_ui (t, 1) == 0);
+
+      mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
+
+      if (!mp_prime_p (t))
+        {
+          if (flag_verbose > 0)
+            {
+              printf ("[composite factor--restarting pollard-rho] ");
+            }
+          mp_factor_using_pollard_rho (t, a + 1, factors);
+        }
+      else
+        {
+          mp_factor_insert (factors, t);
+        }
+
+      if (mp_prime_p (n))
+        {
+          mp_factor_insert (factors, n);
+          break;
+        }
+
+      mpz_mod (x, x, n);
+      mpz_mod (z, z, n);
+      mpz_mod (y, y, n);
+    }
+
+  mpz_clears (P, t2, t, z, x, y, NULL);
+}
+#endif
+
+/* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
+   algorithm is replaced, consider also returning the remainder. */
+static uintmax_t _GL_ATTRIBUTE_CONST
+isqrt (uintmax_t n)
+{
+  uintmax_t x;
+  unsigned c;
+  if (n == 0)
+    return 0;
+
+  count_leading_zeros (c, n);
+
+  /* Make x > sqrt(n). This will be invariant through the loop. */
+  x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
+
+  for (;;)
+    {
+      uintmax_t y = (x + n/x) / 2;
+      if (y >= x)
+        return x;
+
+      x = y;
+    }
+}
+
+static uintmax_t _GL_ATTRIBUTE_CONST
+isqrt2 (uintmax_t nh, uintmax_t nl)
+{
+  unsigned int shift;
+  uintmax_t x;
+
+  /* Ensures the remainder fits in an uintmax_t. */
+  assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
+
+  if (nh == 0)
+    return isqrt (nl);
+
+  count_leading_zeros (shift, nh);
+  shift &= ~1;
+
+  /* Make x > sqrt(n) */
+  x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
+  x <<= (W_TYPE_SIZE - shift) / 2;
+
+  /* Do we need more than one iteration? */
+  for (;;)
+    {
+      uintmax_t q, r, y;
+      udiv_qrnnd (q, r, nh, nl, x);
+      y = (x + q) / 2;
+
+      if (y >= x)
+        {
+          uintmax_t hi, lo;
+          umul_ppmm (hi, lo, x + 1, x + 1);
+          assert (gt2 (hi, lo, nh, nl));
+
+          umul_ppmm (hi, lo, x, x);
+          assert (ge2 (nh, nl, hi, lo));
+          sub_ddmmss (hi, lo, nh, nl, hi, lo);
+          assert (hi == 0);
+
+          return x;
+        }
+
+      x = y;
+    }
+}
+
+/* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
+#define MAGIC64 ((uint64_t) 0x0202021202030213ULL)
+#define MAGIC63 ((uint64_t) 0x0402483012450293ULL)
+#define MAGIC65 ((uint64_t) 0x218a019866014613ULL)
+#define MAGIC11 0x23b
+
+/* Returns the square root if the input is a square, otherwise 0. */
+static uintmax_t _GL_ATTRIBUTE_CONST
+is_square (uintmax_t x)
+{
+  /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
+     computing the square root. */
+  if (((MAGIC64 >> (x & 63)) & 1)
+      && ((MAGIC63 >> (x % 63)) & 1)
+      /* Both 0 and 64 are squares mod (65) */
+      && ((MAGIC65 >> ((x % 65) & 63)) & 1)
+      && ((MAGIC11 >> (x % 11) & 1)))
+    {
+      uintmax_t r = isqrt (x);
+      if (r*r == x)
+        return r;
+    }
+  return 0;
+}
+
+static const unsigned short invtab[] =
+  {
+    0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
+    0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
+    0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
+    0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
+    0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
+    0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
+    0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
+    0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
+    0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
+    0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
+    0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
+    0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
+    0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
+    0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
+    0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
+    0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
+  };
+
+/* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
+   that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
+#define div_smallq(q, r, u, d)                                          \
+  do {                                                                  \
+    if (0 && (u) / 0x40 < (d))                                          \
+      {                                                                 \
+        int _cnt;                                                       \
+        uintmax_t _dinv, _mask, _q, _r;                                 \
+        count_leading_zeros (_cnt, (d));                                \
+                                                                        \
+        _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt))                \
+                       - (1 << (8 - 1))];                               \
+                                                                        \
+        _r = (u);                                                       \
+        _q = _r * _dinv >> (W_TYPE_SIZE + 8 - _cnt);                    \
+        _r -= _q*(d);                                                   \
+                                                                        \
+        _mask = -(uintmax_t) (_r >= (d));                               \
+        (r) = _r - (_mask & (d));                                       \
+        (q) = _q - _mask;                                               \
+        assert ( (q) * (d) + (r) == u);                                 \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        uintmax_t _q = (u) / (d);                                       \
+        (r) = (u) - _q * (d);                                           \
+        (q) = _q;                                                       \
+      }                                                                 \
+  } while (0)
+
+/* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
+   = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
+   with 3025 = 55^2.
+
+   Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
+   representing G_0 = (-55, 2*4652, 8653).
+
+   In the notation of the paper:
+
+   S_{-1} = 55, S_0 = 8653, R_0 = 4652
+
+   Put
+
+     t_0 = floor([q_0 + R_0] / S0) = 1
+     R_1 = t_0 * S_0 - R_0 = 4001
+     S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
+*/
+
+/* Multipliers, in order of efficiency:
+   0.7268  3*5*7*11 = 1155 = 3 (mod 4)
+   0.7317  3*5*7    =  105 = 1
+   0.7820  3*5*11   =  165 = 1
+   0.7872  3*5      =   15 = 3
+   0.8101  3*7*11   =  231 = 3
+   0.8155  3*7      =   21 = 1
+   0.8284  5*7*11   =  385 = 1
+   0.8339  5*7      =   35 = 3
+   0.8716  3*11     =   33 = 1
+   0.8774  3        =    3 = 3
+   0.8913  5*11     =   55 = 3
+   0.8972  5        =    5 = 1
+   0.9233  7*11     =   77 = 1
+   0.9295  7        =    7 = 3
+   0.9934  11       =   11 = 3
+*/
+#define QUEUE_SIZE 50
+
+#if STAT_SQUFOF
+# define Q_FREQ_SIZE 50
+/* Element 0 keeps the total */
+static unsigned int q_freq[Q_FREQ_SIZE + 1];
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+
+static void
+factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
+{
+  /* Uses algorithm and notation from
+
+     SQUARE FORM FACTORIZATION
+     JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
+
+     http://homes.cerias.purdue.edu/~ssw/squfof.pdf
+   */
+
+  static const unsigned int multipliers_1[] =
+    { /* = 1 (mod 4) */
+      105, 165, 21, 385, 33, 5, 77, 1, 0
+    };
+  static const unsigned int multipliers_3[] =
+    { /* = 3 (mod 4) */
+      1155, 15, 231, 35, 3, 55, 7, 11, 0
+    };
+
+  uintmax_t S;
+  const unsigned int *m;
+
+  struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
+
+  S = isqrt2 (n1, n0);
+
+  if (n0 == S * S)
+    {
+      uintmax_t p1, p0;
+
+      umul_ppmm (p1, p0, S, S);
+      assert (p0 == n0);
+
+      if (n1 == p1)
+        {
+          if (prime_p (S))
+            factor_insert_multiplicity (factors, S, 2);
+          else
+            {
+              struct factors f;
+
+              f.nfactors = 0;
+              factor_using_squfof (0, S, &f);
+              /* Duplicate the new factors */
+              for (unsigned int i = 0; i < f.nfactors; i++)
+                factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
+            }
+          return;
+        }
+    }
+
+  /* Select multipliers so we always get n * mu = 3 (mod 4) */
+  for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
+       *m; m++)
+    {
+      uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
+      unsigned int i;
+      unsigned int mu = *m;
+      unsigned int qpos = 0;
+
+      assert (mu * n0 % 4 == 3);
+
+      /* In the notation of the paper, with mu * n == 3 (mod 4), we
+         get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
+         I understand it, the necessary bound is 4 \mu^3 < n, or 32
+         mu^3 < n.
+
+         However, this seems insufficient: With n = 37243139 and mu =
+         105, we get a trivial factor, from the square 38809 = 197^2,
+         without any corresponding Q earlier in the iteration.
+
+         Requiring 64 mu^3 < n seems sufficient. */
+      if (n1 == 0)
+        {
+          if ((uintmax_t) mu*mu*mu >= n0 / 64)
+            continue;
+        }
+      else
+        {
+          if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
+            continue;
+        }
+      umul_ppmm (Dh, Dl, n0, mu);
+      Dh += n1 * mu;
+
+      assert (Dl % 4 != 1);
+
+      S = isqrt2 (Dh, Dl);
+
+      Q1 = 1;
+      P = S;
+
+      /* Square root remainder fits in one word, so ignore high part. */
+      Q = Dl - P*P;
+      /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
+      L = isqrt (2*S);
+      B = 2*L;
+      L1 = mu * 2 * L;
+
+      /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
+         4 D. */
+
+      for (i = 0; i <= B; i++)
+        {
+          uintmax_t q, P1, t, r;
+
+          div_smallq (q, r, S+P, Q);
+          P1 = S - r;   /* P1 = q*Q - P */
+
+#if STAT_SQUFOF
+          assert (q > 0);
+          q_freq[0]++;
+          q_freq[MIN(q, Q_FREQ_SIZE)]++;
+#endif
+
+          if (Q <= L1)
+            {
+              uintmax_t g = Q;
+
+              if ( (Q & 1) == 0)
+                g /= 2;
+
+              g /= gcd_odd (g, mu);
+
+              if (g <= L)
+                {
+                  if (qpos >= QUEUE_SIZE)
+                    {
+                      fprintf (stderr, "squfof queue overflow.\n");
+                      exit (EXIT_FAILURE);
+                    }
+                  queue[qpos].Q = g;
+                  queue[qpos].P = P % g;
+                  qpos++;
+                }
+            }
+
+          /* I think the difference can be either sign, but mod
+             2^W_TYPE_SIZE arithmetic should be fine. */
+          t = Q1 + q * (P - P1);
+          Q1 = Q;
+          Q = t;
+          P = P1;
+
+          if ( (i & 1) == 0)
+            {
+              uintmax_t r = is_square (Q);
+              if (r)
+                {
+                  unsigned int j;
+
+                  for (j = 0; j < qpos; j++)
+                    {
+                      if (queue[j].Q == r)
+                        {
+                          if (r == 1)
+                            /* Traversed entire cycle. */
+                            goto next_multiplier;
+
+                          /* Need the absolute value for divisibility test. */
+                          if (P >= queue[j].P)
+                            t = P - queue[j].P;
+                          else
+                            t = queue[j].P - P;
+                          if (t % r == 0)
+                            {
+                              /* Delete entries up to and including entry
+                                 j, which matched. */
+                              memmove (queue, queue + j + 1,
+                                       (qpos - j - 1) * sizeof (queue[0]));
+                              qpos -= (j + 1);
+                            }
+                          goto next_i;
+                        }
+                    }
+
+                  /* We have found a square form, which should give a
+                     factor. */
+                  Q1 = r;
+                  assert (S >= P); /* What signs are possible? */
+                  P += r * ((S - P) / r);
+
+                  /* Note: Paper says (N - P*P) / Q1, that seems incorrect
+                     for the case D = 2N. */
+                  /* Compute Q = (D - P*P) / Q1, but we need double
+                     precision. */
+                  {
+                    uintmax_t hi, lo, rem;
+                    umul_ppmm (hi, lo, P, P);
+                    sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
+                    udiv_qrnnd (Q, rem, hi, lo, Q1);
+                    assert (rem == 0);
+                  }
+
+                  for (;;)
+                    {
+                      uintmax_t r;
+
+                      /* Note: There appears to by a typo in the paper,
+                         Step 4a in the algorithm description says q <--
+                         floor([S+P]/\hat Q), but looking at the equations
+                         in Sec. 3.1, it should be q <-- floor([S+P] / Q).
+                         (In this code, \hat Q is Q1). */
+                      div_smallq (q, r, S+P, Q);
+                      P1 = S - r;       /* P1 = q*Q - P */
+
+#if STAT_SQUFOF
+                      q_freq[0]++;
+                      q_freq[MIN(q, Q_FREQ_SIZE)]++;
+#endif
+                      if (P == P1)
+                        break;
+                      t = Q1 + q * (P - P1);
+                      Q1 = Q;
+                      Q = t;
+                      P = P1;
+                    }
+
+                  if ( (Q & 1) == 0)
+                    Q /= 2;
+                  Q /= gcd_odd (Q, mu);
+
+                  assert (Q > 1 && (n1 || Q < n0));
+
+                  if (prime_p (Q))
+                    factor_insert (factors, Q);
+                  else
+                    factor_using_squfof (0, Q, factors);
+
+                  divexact_21 (n1, n0, n1, n0, Q);
+
+                  if (prime2_p (n1, n0))
+                    factor_insert_large (factors, n1, n0);
+                  else
+                    {
+                      if (n1 == 0)
+                        factor_using_pollard_rho (n0, 1, factors);
+                      else
+                        factor_using_squfof (n1, n0, factors);
+                    }
+
+                  return;
+                }
+            }
+        next_i:
+          ;
+        }
+    next_multiplier:
+      ;
+    }
+  fprintf (stderr, "squfof failed.\n");
+  exit (EXIT_FAILURE);
+}
+
+static void
+factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
+{
+  factors->nfactors = 0;
+  factors->plarge[1] = 0;
+
+  if (t1 == 0 && t0 < 2)
+    return;
+
+  t0 = factor_using_division (&t1, t1, t0, factors);
+
+  if (t1 == 0)
+    {
+      if (t0 != 1)
+        {
+          if (prime_p (t0))
+            factor_insert (factors, t0);
+          else if (alg == ALG_POLLARD_RHO)
+            factor_using_pollard_rho (t0, 1, factors);
+          else
+            factor_using_squfof (0, t0, factors);
+        }
+    }
+  else
+    {
+      if (prime2_p (t1, t0))
+        factor_insert_large (factors, t1, t0);
+      else if (alg == ALG_POLLARD_RHO)
+        factor_using_pollard_rho2 (t1, t0, 1, factors);
+      else
+        factor_using_squfof (t1, t0, factors);
+    }
+}
+
+#if HAVE_GMP
+static void
+mp_factor (mpz_t t, struct mp_factors *factors)
+{
+  mp_factor_init (factors);
+
+  if (mpz_sgn (t) != 0)
+    {
+      mp_factor_using_division (t, factors);
+
+      if (mpz_cmp_ui (t, 1) != 0)
+        {
+          if (flag_verbose > 0)
+            {
+              printf ("[is number prime?] ");
+            }
+          if (mp_prime_p (t))
+            mp_factor_insert (factors, t);
+          else
+            mp_factor_using_pollard_rho (t, 1, factors);
+        }
+    }
+}
+#endif
+
+static int
+strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
+{
+  int errcode;
+  unsigned int lo_carry;
+  uintmax_t hi, lo;
+
+  errcode = -1;
+
+  hi = lo = 0;
+  for (;;)
+    {
+      unsigned int c = *s++;
+      if (c == 0)
+        break;
+
+      if (UNLIKELY (c < '0' || c > '9'))
+        {
+          errcode = -1;
+          break;
+        }
+      c -= '0';
+
+      errcode = 0;              /* we've seen at least one valid digit */
+
+      if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
+        {
+          errcode = -1; /* overflow */
+          break;
+        }
+      hi = 10 * hi;
+
+      lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
+      lo_carry += 10 * lo < 2 * lo;
+
+      lo = 10 * lo;
+      lo += c;
+
+      lo_carry += lo < c;
+      hi += lo_carry;
+      if (UNLIKELY (hi < lo_carry))
+        {
+          errcode = -1; /* overflow */
+          break;
+        }
+    }
+
+  *hip = hi;
+  *lop = lo;
+
+  return errcode;
+}
+
+static void
+print_uintmaxes (uintmax_t t1, uintmax_t t0)
+{
+  uintmax_t q, r;
+
+  if (t1 == 0)
+    printf ("%ju", t0);
+  else
+    {
+      /* Use very plain code here since it seems hard to write fast code
+         without assuming a specific word size.  */
+      q = t1 / 1000000000;
+      r = t1 % 1000000000;
+      udiv_qrnnd (t0, r, r, t0, 1000000000);
+      print_uintmaxes (q, t0);
+      printf ("%09u", (int) r);
+    }
+}
+
+static void
+factor_one (const char *input)
+{
+  uintmax_t t1, t0;
+  int errcode;
+
+  /* Try converting the number to one or two words.  If it fails, use GMP or
+     print an error message.  The 2nd condition checks that the most
+     significant bit of the two-word number is clear, in a typesize neutral
+     way.  */
+  errcode = strto2uintmax (&t1, &t0, input);
+  if (errcode == 0 && ((t1 << 1) >> 1) == t1)
+    {
+      struct factors factors;
+
+      print_uintmaxes (t1, t0);
+      printf (":");
+
+      factor (t1, t0, &factors);
+
+      for (unsigned int j = 0; j < factors.nfactors; j++)
+        for (unsigned int k = 0; k < factors.e[j]; k++)
+          printf (" %ju", factors.p[j]);
+
+      if (factors.plarge[1])
+        {
+          printf (" ");
+          print_uintmaxes (factors.plarge[1], factors.plarge[0]);
+        }
+      puts ("");
+    }
+  else
+    {
+#if HAVE_GMP
+      mpz_t t;
+      struct mp_factors factors;
+
+      mpz_init_set_str (t, input, 10);
+
+      gmp_printf ("%Zd:", t);
+      mp_factor (t, &factors);
+
+      for (unsigned int j = 0; j < factors.nfactors; j++)
+        for (unsigned int k = 0; k < factors.e[j]; k++)
+          gmp_printf (" %Zd", factors.p[j]);
+
+      mp_factor_clear (&factors);
+      mpz_clear (t);
+      puts ("");
+#else
+      fprintf (stderr, "error: number %s not parsable or too large\n", input);
+      exit (EXIT_FAILURE);
+#endif
+    }
+}
+
+struct inbuf
+{
+  char *buf;
+  size_t alloc;
+};
+
+/* Read white-space delimited items. Return 1 on success, 0 on EOF.
+   Exit on I/O errors. */
+int
+read_item (struct inbuf *bufstruct)
+{
+  int c;
+  size_t i;
+  char *buf = bufstruct->buf;
+
+  do
+    c = getchar_unlocked ();
+  while (isspace (c));
+
+  for (i = 0; !isspace(c); i++)
+    {
+      if (c < 0)
+        {
+          if (ferror (stdin))
+            {
+              fprintf (stderr, "read error on stdin: %s\n", strerror(errno));
+              exit (EXIT_FAILURE);
+            }
+          if (i == 0)
+            return 0;
+          else
+            break;
+        }
+
+      if (UNLIKELY (bufstruct->alloc <= i + 1)) /* +1 byte for terminating NUL 
*/
+        {
+          bufstruct->alloc = bufstruct->alloc * 5 / 4 + 1;
+          bufstruct->buf = realloc (bufstruct->buf, bufstruct->alloc);
+          buf = bufstruct->buf;
+        }
+
+      buf[i] = c;
+      c = getchar_unlocked ();
+    }
+
+  buf[i] = '\0';
+  return 1;
+}
+
+int
+main (int argc, char *argv[])
+{
+  int c;
+
+  alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
+
+  while ( (c = getopt(argc, argv, "svw")) != -1)
+    switch (c)
+      {
+      case 's':
+        alg = ALG_SQUFOF;
+        break;
+      case 'v':
+        flag_verbose = 1;
+        break;
+      case 'w':
+        flag_prove_primality = 0;
+        break;
+      case '?':
+        printf ("Usage: %s [-s] number ...\n", argv[0]);
+        return EXIT_FAILURE;
+      }
+#if STAT_SQUFOF
+  if (alg == ALG_SQUFOF)
+    memset (q_freq, 0, sizeof (q_freq));
+#endif
+
+  if (optind < argc)
+    for (int i = optind; i < argc; i++)
+      factor_one (argv[i]);
+  else
+    {
+      struct inbuf bufstruct;
+      bufstruct.alloc = 50;     /* enough unless HAVE_GMP */
+      bufstruct.buf = malloc (bufstruct.alloc);
+      while (read_item (&bufstruct))
+        factor_one (bufstruct.buf);
+    }
+
+#if STAT_SQUFOF
+  if (alg == ALG_SQUFOF && q_freq[0] > 0)
+    {
+      double acc_f;
+      printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
+      for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
+        {
+          double f = (double) q_freq[i] / q_freq[0];
+          acc_f += f;
+          printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
+                  100.0 * f, 100.0 * acc_f);
+        }
+    }
+#endif
+
+  exit (EXIT_SUCCESS);
+}
diff --git a/src/longlong.h b/src/longlong.h
new file mode 100644
index 0000000..84e32ee
--- /dev/null
+++ b/src/longlong.h
@@ -0,0 +1,2156 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+
+Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
+2004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it under the
+terms of the GNU Lesser General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this file.  If not, see http://www.gnu.org/licenses/.  */
+
+/* You have to define the following before including this file:
+
+   UWtype -- An unsigned type, default type for operations (typically a "word")
+   UHWtype -- An unsigned type, at least half the size of UWtype
+   UDWtype -- An unsigned type, at least twice as large a UWtype
+   W_TYPE_SIZE -- size in bits of UWtype
+
+   SItype, USItype -- Signed and unsigned 32 bit types
+   DItype, UDItype -- Signed and unsigned 64 bit types
+
+   On a 32 bit machine UWtype should typically be USItype;
+   on a 64 bit machine, UWtype should typically be UDItype.
+
+   Optionally, define:
+
+   LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
+   NO_ASM -- Disable inline asm
+
+
+   CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
+   need to include gmp.h and gmp-impl.h, or certain things might not work as
+   expected.
+*/
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+/* This is used to make sure no undesirable sharing between different libraries
+   that use this file takes place.  */
+#ifndef __MPN
+#define __MPN(x) __##x
+#endif
+
+/* Define auxiliary asm macros.
+
+   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
+   word product in HIGH_PROD and LOW_PROD.
+
+   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
+   UDWtype product.  This is just a variant of umul_ppmm.
+
+   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+   denominator) divides a UDWtype, composed by the UWtype integers
+   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
+   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
+   than DENOMINATOR for correct operation.  If, in addition, the most
+   significant bit of DENOMINATOR must be 1, then the pre-processor symbol
+   UDIV_NEEDS_NORMALIZATION is defined to 1.
+
+   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
+   is rounded towards 0.
+
+   5) count_leading_zeros(count, x) counts the number of zero-bits from the
+   msb to the first non-zero bit in the UWtype X.  This is the number of
+   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
+   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
+
+   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
+   from the least significant end.
+
+   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+   high_addend_2, low_addend_2) adds two UWtype integers, composed by
+   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
+   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
+   (i.e. carry out) is not stored anywhere, and is lost.
+
+   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
+   composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
+   LOW_SUBTRAHEND_2 respectively.  The result is placed in HIGH_DIFFERENCE
+   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
+   and is lost.
+
+   If any of these macros are left undefined for a particular CPU,
+   C macros are used.
+
+
+   Notes:
+
+   For add_ssaaaa the two high and two low addends can both commute, but
+   unfortunately gcc only supports one "%" commutative in each asm block.
+   This has always been so but is only documented in recent versions
+   (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
+   compiler error in certain rare circumstances.
+
+   Apparently it was only the last "%" that was ever actually respected, so
+   the code has been updated to leave just that.  Clearly there's a free
+   choice whether high or low should get it, if there's a reason to favour
+   one over the other.  Also obviously when the constraints on the two
+   operands are identical there's no benefit to the reloader in any "%" at
+   all.
+
+   */
+
+/* The CPUs come in alphabetical order below.
+
+   Please add support for more CPUs here, or improve the current support
+   for the CPUs below!  */
+
+
+/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
+   3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
+   Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
+   __builtin_ctzll.
+
+   These builtins are only used when we check what code comes out, on some
+   chips they're merely libgcc calls, where we will instead want an inline
+   in that case (either asm or generic C).
+
+   These builtins are better than an asm block of the same insn, since an
+   asm block doesn't give gcc any information about scheduling or resource
+   usage.  We keep an asm block for use on prior versions of gcc though.
+
+   For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
+   it's not used (for count_leading_zeros) because it generally gives extra
+   code to ensure the result is 0 when the input is 0, which we don't need
+   or want.  */
+
+#ifdef _LONG_LONG_LIMB
+#define count_leading_zeros_gcc_clz(count,x)    \
+  do {                                          \
+    ASSERT ((x) != 0);                          \
+    (count) = __builtin_clzll (x);              \
+  } while (0)
+#else
+#define count_leading_zeros_gcc_clz(count,x)    \
+  do {                                          \
+    ASSERT ((x) != 0);                          \
+    (count) = __builtin_clzl (x);               \
+  } while (0)
+#endif
+
+#ifdef _LONG_LONG_LIMB
+#define count_trailing_zeros_gcc_ctz(count,x)   \
+  do {                                          \
+    ASSERT ((x) != 0);                          \
+    (count) = __builtin_ctzll (x);              \
+  } while (0)
+#else
+#define count_trailing_zeros_gcc_ctz(count,x)   \
+  do {                                          \
+    ASSERT ((x) != 0);                          \
+    (count) = __builtin_ctzl (x);               \
+  } while (0)
+#endif
+
+
+/* FIXME: The macros using external routines like __MPN(count_leading_zeros)
+   don't need to be under !NO_ASM */
+#if ! defined (NO_ASM)
+
+#if defined (__alpha) && W_TYPE_SIZE == 64
+/* Most alpha-based machines, except Cray systems. */
+#if defined (__GNUC__)
+#if __GMP_GNUC_PREREQ (3,3)
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    UDItype __m0 = (m0), __m1 = (m1);                                  \
+    (ph) = __builtin_alpha_umulh (__m0, __m1);                         \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#else
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    UDItype __m0 = (m0), __m1 = (m1);                                  \
+    __asm__ ("umulh %r1,%2,%0"                                         \
+            : "=r" (ph)                                                \
+            : "%rJ" (m0), "rI" (m1));                                  \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#endif
+#define UMUL_TIME 18
+#else /* ! __GNUC__ */
+#include <machine/builtins.h>
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    UDItype __m0 = (m0), __m1 = (m1);                                  \
+    (ph) = __UMULH (m0, m1);                                           \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#endif
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UWtype __di;                                                    \
+    __di = __MPN(invert_limb) (d);                                     \
+    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                         \
+  } while (0)
+#define UDIV_PREINV_ALWAYS  1
+#define UDIV_NEEDS_NORMALIZATION 1
+#define UDIV_TIME 220
+#endif /* LONGLONG_STANDALONE */
+
+/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
+   always goes into libgmp.so, even when not actually used.  */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
+#define count_leading_zeros(COUNT,X) \
+  __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
+#define count_trailing_zeros(COUNT,X) \
+  __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
+#endif /* clz/ctz using cix */
+
+#if ! defined (count_leading_zeros)                             \
+  && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
+/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
+   "$31" is written explicitly in the asm, since an "r" constraint won't
+   select reg 31.  There seems no need to worry about "r31" syntax for cray,
+   since gcc itself (pre-release 3.4) emits just $31 in various places.  */
+#define ALPHA_CMPBGE_0(dst, src)                                        \
+  do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
+/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
+   them, locating the highest non-zero byte.  A second __clz_tab lookup
+   counts the leading zero bits in that byte, giving the result.  */
+#define count_leading_zeros(count, x)                                   \
+  do {                                                                  \
+    UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
+    ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
+    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
+    __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
+    __clz__x >>= __clz__b;                                              \
+    __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
+    __clz__b = 65 - __clz__b;                                           \
+    (count) = __clz__b - __clz__c;                                      \
+  } while (0)
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#endif /* clz using cmpbge */
+
+#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
+#if HAVE_ATTRIBUTE_CONST
+long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
+#else
+long __MPN(count_leading_zeros) (UDItype);
+#endif
+#define count_leading_zeros(count, x) \
+  ((count) = __MPN(count_leading_zeros) (x))
+#endif /* clz using mpn */
+#endif /* __alpha */
+
+#if defined (__AVR) && W_TYPE_SIZE == 8
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    unsigned short __p = (unsigned short) (m0) * (m1);                 \
+    (ph) = __p >> 8;                                                   \
+    (pl) = __p;                                                                
\
+  } while (0)
+#endif /* AVR */
+
+#if defined (_CRAY) && W_TYPE_SIZE == 64
+#include <intrinsics.h>
+#define UDIV_PREINV_ALWAYS  1
+#define UDIV_NEEDS_NORMALIZATION 1
+#define UDIV_TIME 220
+long __MPN(count_leading_zeros) (UDItype);
+#define count_leading_zeros(count, x) \
+  ((count) = _leadz ((UWtype) (x)))
+#if defined (_CRAYIEEE)                /* I.e., Cray T90/ieee, T3D, and T3E */
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    UDItype __m0 = (m0), __m1 = (m1);                                  \
+    (ph) = _int_mult_upper (m0, m1);                                   \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UWtype __di;                                                    \
+    __di = __MPN(invert_limb) (d);                                     \
+    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                         \
+  } while (0)
+#endif /* LONGLONG_STANDALONE */
+#endif /* _CRAYIEEE */
+#endif /* _CRAY */
+
+#if defined (__ia64) && W_TYPE_SIZE == 64
+/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
+   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
+   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
+   register, which takes an extra cycle.  */
+#define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
+  do {                                          \
+    UWtype __x;                                 \
+    __x = (al) - (bl);                          \
+    if ((al) < (bl))                            \
+      (sh) = (ah) - (bh) - 1;                   \
+    else                                        \
+      (sh) = (ah) - (bh);                       \
+    (sl) = __x;                                 \
+  } while (0)
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+/* Do both product parts in assembly, since that gives better code with
+   all gcc versions.  Some callers will just use the upper part, and in
+   that situation we waste an instruction, but not any cycles.  */
+#define umul_ppmm(ph, pl, m0, m1) \
+    __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"         \
+            : "=&f" (ph), "=f" (pl)                                    \
+            : "f" (m0), "f" (m1))
+#define UMUL_TIME 14
+#define count_leading_zeros(count, x) \
+  do {                                                                 \
+    UWtype _x = (x), _y, _a, _c;                                       \
+    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));             \
+    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));           \
+    _c = (_a - 1) << 3;                                                        
\
+    _x >>= _c;                                                         \
+    if (_x >= 1 << 4)                                                  \
+      _x >>= 4, _c += 4;                                               \
+    if (_x >= 1 << 2)                                                  \
+      _x >>= 2, _c += 2;                                               \
+    _c += _x >> 1;                                                     \
+    (count) =  W_TYPE_SIZE - 1 - _c;                                   \
+  } while (0)
+/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
+   based, and we don't need a special case for x==0 here */
+#define count_trailing_zeros(count, x)                                 \
+  do {                                                                 \
+    UWtype __ctz_x = (x);                                              \
+    __asm__ ("popcnt %0 = %1"                                          \
+            : "=r" (count)                                             \
+            : "r" ((__ctz_x-1) & ~__ctz_x));                           \
+  } while (0)
+#endif
+#if defined (__INTEL_COMPILER)
+#include <ia64intrin.h>
+#define umul_ppmm(ph, pl, m0, m1)                                      \
+  do {                                                                 \
+    UWtype _m0 = (m0), _m1 = (m1);                                     \
+    ph = _m64_xmahu (_m0, _m1, 0);                                     \
+    pl = _m0 * _m1;                                                    \
+  } while (0)
+#endif
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UWtype __di;                                                    \
+    __di = __MPN(invert_limb) (d);                                     \
+    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                         \
+  } while (0)
+#define UDIV_PREINV_ALWAYS  1
+#define UDIV_NEEDS_NORMALIZATION 1
+#endif
+#define UDIV_TIME 220
+#endif
+
+
+#if defined (__GNUC__)
+
+/* We sometimes need to clobber "cc" with gcc2, but that would not be
+   understood by gcc1.  Use cpp to avoid major code duplication.  */
+#if __GNUC__ < 2
+#define __CLOBBER_CC
+#define __AND_CLOBBER_CC
+#else /* __GNUC__ >= 2 */
+#define __CLOBBER_CC : "cc"
+#define __AND_CLOBBER_CC , "cc"
+#endif /* __GNUC__ < 2 */
+
+#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                             \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                             \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
+#define umul_ppmm(xh, xl, m0, m1) \
+  do {                                                                 \
+    USItype __m0 = (m0), __m1 = (m1);                                  \
+    __asm__ ("multiplu %0,%1,%2"                                       \
+            : "=r" (xl)                                                \
+            : "r" (__m0), "r" (__m1));                                 \
+    __asm__ ("multmu %0,%1,%2"                                         \
+            : "=r" (xh)                                                \
+            : "r" (__m0), "r" (__m1));                                 \
+  } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("dividu %0,%3,%4"                                           \
+          : "=r" (q), "=q" (r)                                         \
+          : "1" (n1), "r" (n0), "r" (d))
+#define count_leading_zeros(count, x) \
+    __asm__ ("clz %0,%1"                                               \
+            : "=r" (count)                                             \
+            : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* __a29k__ */
+
+#if defined (__arc__)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                      \
+          : "=r" (sh),                                                 \
+            "=&r" (sl)                                                 \
+          : "r"  ((USItype) (ah)),                                     \
+            "rIJ" ((USItype) (bh)),                                    \
+            "%r" ((USItype) (al)),                                     \
+            "rIJ" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                      \
+          : "=r" (sh),                                                 \
+            "=&r" (sl)                                                 \
+          : "r" ((USItype) (ah)),                                      \
+            "rIJ" ((USItype) (bh)),                                    \
+            "r" ((USItype) (al)),                                      \
+            "rIJ" ((USItype) (bl)))
+#endif
+
+#if defined (__arm__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    if (__builtin_constant_p (al))                                     \
+      {                                                                        
\
+       if (__builtin_constant_p (ah))                                  \
+         __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
+                  : "=r" (sh), "=&r" (sl)                              \
+                  : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
+       else                                                            \
+         __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
+                  : "=r" (sh), "=&r" (sl)                              \
+                  : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
+      }                                                                        
\
+    else if (__builtin_constant_p (ah))                                        
\
+      {                                                                        
\
+       if (__builtin_constant_p (bl))                                  \
+         __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
+                  : "=r" (sh), "=&r" (sl)                              \
+                  : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
+       else                                                            \
+         __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
+                  : "=r" (sh), "=&r" (sl)                              \
+                  : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
+      }                                                                        
\
+    else if (__builtin_constant_p (bl))                                        
\
+      {                                                                        
\
+       if (__builtin_constant_p (bh))                                  \
+         __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
+                  : "=r" (sh), "=&r" (sl)                              \
+                  : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
+       else                                                            \
+         __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
+                  : "=r" (sh), "=&r" (sl)                              \
+                  : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
+      }                                                                        
\
+    else /* only bh might be a constant */                             \
+      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                   \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
+    } while (0)
+#if 1 || defined (__arm_m__)   /* `M' series has widening multiply support */
+#define umul_ppmm(xh, xl, a, b) \
+  __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
+#define UMUL_TIME 5
+#define smul_ppmm(xh, xl, a, b) \
+  __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UWtype __di;                                                    \
+    __di = __MPN(invert_limb) (d);                                     \
+    udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                         \
+  } while (0)
+#define UDIV_PREINV_ALWAYS  1
+#define UDIV_NEEDS_NORMALIZATION 1
+#define UDIV_TIME 70
+#endif /* LONGLONG_STANDALONE */
+#else
+#define umul_ppmm(xh, xl, a, b) \
+  __asm__ ("%@ Inlined umul_ppmm\n"                                    \
+"      mov     %|r0, %2, lsr #16\n"                                    \
+"      mov     %|r2, %3, lsr #16\n"                                    \
+"      bic     %|r1, %2, %|r0, lsl #16\n"                              \
+"      bic     %|r2, %3, %|r2, lsl #16\n"                              \
+"      mul     %1, %|r1, %|r2\n"                                       \
+"      mul     %|r2, %|r0, %|r2\n"                                     \
+"      mul     %|r1, %0, %|r1\n"                                       \
+"      mul     %0, %|r0, %0\n"                                         \
+"      adds    %|r1, %|r2, %|r1\n"                                     \
+"      addcs   %0, %0, #65536\n"                                       \
+"      adds    %1, %1, %|r1, lsl #16\n"                                \
+"      adc     %0, %0, %|r1, lsr #16"                                  \
+          : "=&r" (xh), "=r" (xl)                                      \
+          : "r" (a), "r" (b)                                           \
+          : "r0", "r1", "r2")
+#define UMUL_TIME 20
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UWtype __r;                                                     \
+    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                   \
+    (r) = __r;                                                         \
+  } while (0)
+extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
+#define UDIV_TIME 200
+#endif /* LONGLONG_STANDALONE */
+#endif
+/* This is a bizarre test, but GCC doesn't define useful common symbol. */
+#if defined (__ARM_ARCH_5__)  || defined (__ARM_ARCH_5T__) || \
+    defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
+    defined (__ARM_ARCH_6__)  || defined (__ARM_ARCH_6J__) || \
+    defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \
+    defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \
+    defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__)  || \
+    defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \
+    defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
+#define count_leading_zeros(count, x) \
+  __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+#endif /* __arm__ */
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "r" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    if (__builtin_constant_p (bl))                                     \
+      {                                                                        
\
+       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                  \
+                : "=r" (sh), "=&r" (sl)                                \
+                : "r" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
+      }                                                                        
\
+    else /* only bh might be a constant */                             \
+      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                   \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "r" (ah), "rZ" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
+    } while (0)
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    UDItype __m0 = (m0), __m1 = (m1);                                  \
+    __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (m0), "r" (m1));    \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#define count_leading_zeros(count, x) \
+  __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 64
+#endif /* __aarch64__ */
+
+#if defined (__clipper__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __x;                                                        \
+  __asm__ ("mulwux %2,%0"                                              \
+          : "=r" (__x.__ll)                                            \
+          : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
+  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define smul_ppmm(w1, w0, u, v) \
+  ({union {DItype __ll;                                                        
\
+          struct {SItype __l, __h;} __i;                               \
+         } __x;                                                        \
+  __asm__ ("mulwx %2,%0"                                               \
+          : "=r" (__x.__ll)                                            \
+          : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
+  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+  ({UDItype __w;                                                       \
+    __asm__ ("mulwux %2,%0"                                            \
+            : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
+    __w; })
+#endif /* __clipper__ */
+
+/* Fujitsu vector computers.  */
+#if defined (__uxp__) && W_TYPE_SIZE == 32
+#define umul_ppmm(ph, pl, u, v) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("mult.lu %1,%2,%0"        : "=r" (__x.__ll) : "%r" (u), "rK" 
(v));\
+    (ph) = __x.__i.__h;                                                        
\
+    (pl) = __x.__i.__l;                                                        
\
+  } while (0)
+#define smul_ppmm(ph, pl, u, v) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));        
\
+    (ph) = __x.__i.__h;                                                        
\
+    (pl) = __x.__i.__l;                                                        
\
+  } while (0)
+#endif
+
+#if defined (__gmicro__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add.w %5,%1\n\taddx %3,%0"                                 \
+          : "=g" (sh), "=&g" (sl)                                      \
+          : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
+            "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                 \
+          : "=g" (sh), "=&g" (sl)                                      \
+          : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
+            "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+  __asm__ ("mulx %3,%0,%1"                                             \
+          : "=g" (ph), "=r" (pl)                                       \
+          : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
+#define udiv_qrnnd(q, r, nh, nl, d) \
+  __asm__ ("divx %4,%0,%1"                                             \
+          : "=g" (q), "=r" (r)                                         \
+          : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
+#define count_leading_zeros(count, x) \
+  __asm__ ("bsch/1 %1,%0"                                              \
+          : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
+#endif
+
+#if defined (__hppa) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                       \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                       \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
+#if defined (_PA_RISC1_1)
+#define umul_ppmm(wh, wl, u, v) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v));        
\
+    (wh) = __x.__i.__h;                                                        
\
+    (wl) = __x.__i.__l;                                                        
\
+  } while (0)
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#else
+#define UMUL_TIME 40
+#define UDIV_TIME 80
+#endif
+#define count_leading_zeros(count, x) \
+  do {                                                                 \
+    USItype __tmp;                                                     \
+    __asm__ (                                                          \
+       "ldi            1,%0\n"                                         \
+"      extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
+"      extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
+"      ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
+"      extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
+"      extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
+"      ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
+"      extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
+"      extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
+"      ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
+"      extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
+"      extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
+"      ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
+"      extru           %1,30,1,%1      ; Extract bit 1.\n"             \
+"      sub             %0,%1,%0        ; Subtract it.\n"               \
+       : "=r" (count), "=r" (__tmp) : "1" (x));                        \
+  } while (0)
+#endif /* hppa */
+
+/* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
+   (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
+   is just a case of no direct support for 2.0n but treating it like 1.0. */
+#if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                     \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                     \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
+#endif /* hppa */
+
+#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && 
W_TYPE_SIZE == 32
+#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
+  do {                                                                 \
+/*  if (__builtin_constant_p (bl))                                     \
+      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"                           \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
+    else                                                               \
+*/    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"                             \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
+  } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl)                             \
+  do {                                                                 \
+/*  if (__builtin_constant_p (bl))                                     \
+      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"                           \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);  \
+    else                                                               \
+*/    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"                             \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);  \
+  } while (0)
+#if __GMP_GNUC_PREREQ (4,5)
+#define umul_ppmm(xh, xl, m0, m1)                                      \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __x.__ll = (UDItype) (m0) * (UDItype) (m1);                                
\
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                            \
+  } while (0)
+#else
+#if 0
+/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
+   with a new enough processor pretending we have 32-bit registers.  */
+#define umul_ppmm(xh, xl, m0, m1)                                      \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("mlr\t%0,%2"                                              \
+            : "=r" (__x.__ll)                                          \
+            : "%0" (m0), "r" (m1));                                    \
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                            \
+  } while (0)
+#else
+#define umul_ppmm(xh, xl, m0, m1)                                      \
+  do {                                                                 \
+  /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
+     DImode for the product, since that would be allocated to a single 64-bit
+     register, whereas mlr uses the low 32-bits of an even-odd register pair.
+  */                                                                   \
+    register USItype __r0 __asm__ ("0");                               \
+    register USItype __r1 __asm__ ("1") = (m0);                                
\
+    __asm__ ("mlr\t%0,%3"                                              \
+            : "=r" (__r0), "=r" (__r1)                                 \
+            : "r" (__r1), "r" (m1));                                   \
+    (xh) = __r0; (xl) = __r1;                                          \
+  } while (0)
+#endif /* if 0 */
+#endif
+#if 0
+/* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
+   with a new enough processor pretending we have 32-bit registers.  */
+#define udiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __x.__i.__h = n1; __x.__i.__l = n0;                                        
\
+    __asm__ ("dlr\t%0,%2"                                              \
+            : "=r" (__x.__ll)                                          \
+            : "0" (__x.__ll), "r" (d));                                \
+    (q) = __x.__i.__l; (r) = __x.__i.__h;                              \
+  } while (0)
+#else
+#define udiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                 \
+    register USItype __r0 __asm__ ("0") = (n1);                                
\
+    register USItype __r1 __asm__ ("1") = (n0);                                
\
+    __asm__ ("dlr\t%0,%4"                                              \
+            : "=r" (__r0), "=r" (__r1)                                 \
+            : "r" (__r0), "r" (__r1), "r" (d));                        \
+    (q) = __r1; (r) = __r0;                                            \
+  } while (0)
+#endif /* if 0 */
+#else /* if __zarch__ */
+/* FIXME: this fails if gcc knows about the 64-bit registers.  */
+#define smul_ppmm(xh, xl, m0, m1)                                      \
+  do {                                                                 \
+    union {DItype __ll;                                                        
\
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("mr\t%0,%2"                                               \
+            : "=r" (__x.__ll)                                          \
+            : "%0" (m0), "r" (m1));                                    \
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                            \
+  } while (0)
+/* FIXME: this fails if gcc knows about the 64-bit registers.  */
+#define sdiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                 \
+    union {DItype __ll;                                                        
\
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __x.__i.__h = n1; __x.__i.__l = n0;                                        
\
+    __asm__ ("dr\t%0,%2"                                               \
+            : "=r" (__x.__ll)                                          \
+            : "0" (__x.__ll), "r" (d));                                \
+    (q) = __x.__i.__l; (r) = __x.__i.__h;                              \
+  } while (0)
+#endif /* if __zarch__ */
+#endif
+
+#if defined (__s390x__) && W_TYPE_SIZE == 64
+/* We need to cast operands with register constraints, otherwise their types
+   will be assumed to be SImode by gcc.  For these machines, such operations
+   will insert a value into the low 32 bits, and leave the high 32 bits with
+   garbage.  */
+#define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
+  do {                                                                 \
+    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"                             \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),             \
+                "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
+  } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl)                             \
+  do {                                                                 \
+    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"                             \
+            : "=r" (sh), "=&r" (sl)                                    \
+            : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),                \
+              "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);  \
+  } while (0)
+#define umul_ppmm(xh, xl, m0, m1)                                      \
+  do {                                                                 \
+    union {unsigned int __attribute__ ((mode(TI))) __ll;               \
+          struct {UDItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("mlgr\t%0,%2"                                             \
+            : "=r" (__x.__ll)                                          \
+            : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));              \
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                            \
+  } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                 \
+    union {unsigned int __attribute__ ((mode(TI))) __ll;               \
+          struct {UDItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __x.__i.__h = n1; __x.__i.__l = n0;                                        
\
+    __asm__ ("dlgr\t%0,%2"                                             \
+            : "=r" (__x.__ll)                                          \
+            : "0" (__x.__ll), "r" ((UDItype)(d)));                     \
+    (q) = __x.__i.__l; (r) = __x.__i.__h;                              \
+  } while (0)
+#if 0 /* FIXME: Enable for z10 (?) */
+#define count_leading_zeros(cnt, x)                                    \
+  do {                                                                 \
+    union {unsigned int __attribute__ ((mode(TI))) __ll;               \
+          struct {UDItype __h, __l;} __i;                              \
+         } __clr_cnt;                                                  \
+    __asm__ ("flogr\t%0,%1"                                            \
+            : "=r" (__clr_cnt.__ll)                                    \
+            : "r" (x) __CLOBBER_CC);                                   \
+    (cnt) = __clr_cnt.__i.__h;                                         \
+  } while (0)
+#endif
+#endif
+
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                        
\
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
+            "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                        
\
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
+            "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mull %3"                                                   \
+          : "=a" (w0), "=d" (w1)                                       \
+          : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
+#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
+  __asm__ ("divl %4"                /* stringification in K&R C */     \
+          : "=a" (q), "=d" (r)                                         \
+          : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
+
+#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
+/* Pentium bsrl takes between 10 and 72 cycles depending where the most
+   significant 1 bit is, hence the use of the following alternatives.  bsfl
+   is slow too, between 18 and 42 depending where the least significant 1
+   bit is, so let the generic count_trailing_zeros below make use of the
+   count_leading_zeros here too.  */
+
+#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
+/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
+   cache miss reading from __clz_tab.  For P55 it's favoured over the float
+   below so as to avoid mixing MMX and x87, since the penalty for switching
+   between the two is about 100 cycles.
+
+   The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
+   16, -1 for 8, or 0 otherwise.  This could be written equivalently as
+   follows, but as of gcc 2.95.2 it results in conditional jumps.
+
+       __shift = -(__n < 0x1000000);
+       __shift -= (__n < 0x10000);
+       __shift -= (__n < 0x100);
+
+   The middle two sbbl and cmpl's pair, and with luck something gcc
+   generates might pair with the first cmpl and the last sbbl.  The "32+1"
+   constant could be folded into __clz_tab[], but it doesn't seem worth
+   making a different table just for that.  */
+
+#define count_leading_zeros(c,n)                                       \
+  do {                                                                 \
+    USItype  __n = (n);                                                        
\
+    USItype  __shift;                                                  \
+    __asm__ ("cmpl  $0x1000000, %1\n"                                  \
+            "sbbl  %0, %0\n"                                           \
+            "cmpl  $0x10000, %1\n"                                     \
+            "sbbl  $0, %0\n"                                           \
+            "cmpl  $0x100, %1\n"                                       \
+            "sbbl  $0, %0\n"                                           \
+            : "=&r" (__shift) : "r"  (__n));                           \
+    __shift = __shift*8 + 24 + 1;                                      \
+    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                        
\
+  } while (0)
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
+
+#else /* ! pentiummmx || LONGLONG_STANDALONE */
+/* The following should be a fixed 14 cycles or so.  Some scheduling
+   opportunities should be available between the float load/store too.  This
+   sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
+   apparently suggested by the Intel optimizing manual (don't know exactly
+   where).  gcc 2.95 or up will be best for this, so the "double" is
+   correctly aligned on the stack.  */
+#define count_leading_zeros(c,n)                                       \
+  do {                                                                 \
+    union {                                                            \
+      double    d;                                                     \
+      unsigned  a[2];                                                  \
+    } __u;                                                             \
+    ASSERT ((n) != 0);                                                 \
+    __u.d = (UWtype) (n);                                              \
+    (c) = 0x3FF + 31 - (__u.a[1] >> 20);                               \
+  } while (0)
+#define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
+#endif /* pentiummx */
+
+#else /* ! pentium */
+
+#if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
+#define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
+#endif /* gcc clz */
+
+/* On P6, gcc prior to 3.0 generates a partial register stall for
+   __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
+   being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
+   cost of one extra instruction.  Do this for "i386" too, since that means
+   generic x86.  */
+#if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
+  && (HAVE_HOST_CPU_i386                                               \
+      || HAVE_HOST_CPU_i686                                            \
+      || HAVE_HOST_CPU_pentiumpro                                      \
+      || HAVE_HOST_CPU_pentium2                                                
\
+      || HAVE_HOST_CPU_pentium3)
+#define count_leading_zeros(count, x)                                  \
+  do {                                                                 \
+    USItype __cbtmp;                                                   \
+    ASSERT ((x) != 0);                                                 \
+    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));     \
+    (count) = 31 - __cbtmp;                                            \
+  } while (0)
+#endif /* gcc<3 asm bsrl */
+
+#ifndef count_leading_zeros
+#define count_leading_zeros(count, x)                                  \
+  do {                                                                 \
+    USItype __cbtmp;                                                   \
+    ASSERT ((x) != 0);                                                 \
+    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));     \
+    (count) = __cbtmp ^ 31;                                            \
+  } while (0)
+#endif /* asm bsrl */
+
+#if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
+#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
+#endif /* gcc ctz */
+
+#ifndef count_trailing_zeros
+#define count_trailing_zeros(count, x)                                 \
+  do {                                                                 \
+    ASSERT ((x) != 0);                                                 \
+    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));      \
+  } while (0)
+#endif /* asm bsfl */
+
+#endif /* ! pentium */
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 10
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 40
+#endif
+#endif /* 80x86 */
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                        
\
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
+            "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                        
\
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
+            "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mulq %3"                                                   \
+          : "=a" (w0), "=d" (w1)                                       \
+          : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
+#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
+  __asm__ ("divq %4"                /* stringification in K&R C */     \
+          : "=a" (q), "=d" (r)                                         \
+          : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
+/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
+#define count_leading_zeros(count, x)                                  \
+  do {                                                                 \
+    UDItype __cbtmp;                                                   \
+    ASSERT ((x) != 0);                                                 \
+    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));     \
+    (count) = __cbtmp ^ 63;                                            \
+  } while (0)
+/* bsfq destination must be a 64-bit register, "%q0" forces this in case
+   count is only an int. */
+#define count_trailing_zeros(count, x)                                 \
+  do {                                                                 \
+    ASSERT ((x) != 0);                                                 \
+    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));      \
+  } while (0)
+#endif /* x86_64 */
+
+#if defined (__i860__) && W_TYPE_SIZE == 32
+#define rshift_rhlc(r,h,l,c) \
+  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                               \
+          "=r" (r) : "r" (h), "r" (l), "rn" (c))
+#endif /* i860 */
+
+#if defined (__i960__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                    \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                    \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __x;                                                        \
+  __asm__ ("emul %2,%1,%0"                                             \
+          : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
+  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+  ({UDItype __w;                                                       \
+    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));      \
+    __w; })
+#define udiv_qrnnd(q, r, nh, nl, d) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __nn;                                                       \
+    __nn.__i.__h = (nh); __nn.__i.__l = (nl);                          \
+    __asm__ ("ediv %d,%n,%0"                                           \
+          : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
+    (r) = __rq.__i.__l; (q) = __rq.__i.__h;                            \
+  } while (0)
+#define count_leading_zeros(count, x) \
+  do {                                                                 \
+    USItype __cbtmp;                                                   \
+    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));              \
+    (count) = __cbtmp ^ 31;                                            \
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
+#if defined (__i960mx)         /* what is the proper symbol to test??? */
+#define rshift_rhlc(r,h,l,c) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __nn;                                                       \
+    __nn.__i.__h = (h); __nn.__i.__l = (l);                            \
+    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
+  }
+#endif /* i960mx */
+#endif /* i960 */
+
+#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
+     || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
+     || defined (__mc5307__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                             \
+          : "=d" (sh), "=&d" (sl)                                      \
+          : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
+            "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                             \
+          : "=d" (sh), "=&d" (sl)                                      \
+          : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
+            "1" ((USItype)(al)), "g" ((USItype)(bl)))
+/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
+#if defined (__mc68020__) || defined(mc68020) \
+     || defined (__mc68030__) || defined (mc68030) \
+     || defined (__mc68040__) || defined (mc68040) \
+     || defined (__mcpu32__) || defined (mcpu32) \
+     || defined (__NeXT__)
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mulu%.l %3,%1:%0"                                          \
+          : "=d" (w0), "=d" (w1)                                       \
+          : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
+#define UMUL_TIME 45
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("divu%.l %4,%1:%0"                                          \
+          : "=d" (q), "=d" (r)                                         \
+          : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
+#define UDIV_TIME 90
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("divs%.l %4,%1:%0"                                          \
+          : "=d" (q), "=d" (r)                                         \
+          : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
+#else /* for other 68k family members use 16x16->32 multiplication */
+#define umul_ppmm(xh, xl, a, b) \
+  do { USItype __umul_tmp1, __umul_tmp2;                               \
+       __asm__ ("| Inlined umul_ppmm\n"                                \
+"      move%.l %5,%3\n"                                                \
+"      move%.l %2,%0\n"                                                \
+"      move%.w %3,%1\n"                                                \
+"      swap    %3\n"                                                   \
+"      swap    %0\n"                                                   \
+"      mulu%.w %2,%1\n"                                                \
+"      mulu%.w %3,%0\n"                                                \
+"      mulu%.w %2,%3\n"                                                \
+"      swap    %2\n"                                                   \
+"      mulu%.w %5,%2\n"                                                \
+"      add%.l  %3,%2\n"                                                \
+"      jcc     1f\n"                                                   \
+"      add%.l  %#0x10000,%0\n"                                         \
+"1:    move%.l %2,%3\n"                                                \
+"      clr%.w  %2\n"                                                   \
+"      swap    %2\n"                                                   \
+"      swap    %3\n"                                                   \
+"      clr%.w  %3\n"                                                   \
+"      add%.l  %3,%1\n"                                                \
+"      addx%.l %2,%0\n"                                                \
+"      | End inlined umul_ppmm"                                        \
+             : "=&d" (xh), "=&d" (xl),                                 \
+               "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
+             : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
+  } while (0)
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+#endif /* not mc68020 */
+/* The '020, '030, '040 and '060 have bitfield insns.
+   GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
+   exclude bfffo on that chip (bitfield insns not available).  */
+#if (defined (__mc68020__) || defined (mc68020)    \
+     || defined (__mc68030__) || defined (mc68030) \
+     || defined (__mc68040__) || defined (mc68040) \
+     || defined (__mc68060__) || defined (mc68060) \
+     || defined (__NeXT__))                        \
+  && ! defined (__mcpu32__)
+#define count_leading_zeros(count, x) \
+  __asm__ ("bfffo %1{%b2:%b2},%0"                                      \
+          : "=d" (count)                                               \
+          : "od" ((USItype) (x)), "n" (0))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+#endif /* mc68000 */
+
+#if defined (__m88000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                  \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                  \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
+#define count_leading_zeros(count, x) \
+  do {                                                                 \
+    USItype __cbtmp;                                                   \
+    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                  \
+    (count) = __cbtmp ^ 31;                                            \
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 63 /* sic */
+#if defined (__m88110__)
+#define umul_ppmm(wh, wl, u, v) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));  \
+    (wh) = __x.__i.__h;                                                        
\
+    (wl) = __x.__i.__l;                                                        
\
+  } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  ({union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x, __q;                                                   \
+  __x.__i.__h = (n1); __x.__i.__l = (n0);                              \
+  __asm__ ("divu.d %0,%1,%2"                                           \
+          : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
+  (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
+#define UMUL_TIME 5
+#define UDIV_TIME 25
+#else
+#define UMUL_TIME 17
+#define UDIV_TIME 150
+#endif /* __m88110__ */
+#endif /* __m88000__ */
+
+#if defined (__mips) && W_TYPE_SIZE == 32
+#if __GMP_GNUC_PREREQ (4,4)
+#define umul_ppmm(w1, w0, u, v) \
+  do {                                                                 \
+    UDItype __ll = (UDItype)(u) * (v);                                 \
+    w1 = __ll >> 32;                                                   \
+    w0 = __ll;                                                         \
+  } while (0)
+#endif
+#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
+#endif
+#if !defined (umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
+          : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
+#endif
+#define UMUL_TIME 10
+#define UDIV_TIME 100
+#endif /* __mips */
+
+#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if __GMP_GNUC_PREREQ (4,4)
+#define umul_ppmm(w1, w0, u, v) \
+  do {                                                                 \
+    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));       \
+    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                       \
+    w1 = __ll >> 64;                                                   \
+    w0 = __ll;                                                         \
+  } while (0)
+#endif
+#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
+#endif
+#if !defined (umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                                
\
+          : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
+#endif /* __mips */
+
+#if defined (__mmix__) && W_TYPE_SIZE == 64
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
+#endif
+
+#if defined (__ns32000__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __x;                                                        \
+  __asm__ ("meid %2,%0"                                                        
\
+          : "=g" (__x.__ll)                                            \
+          : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
+  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+  ({UDItype __w;                                                       \
+    __asm__ ("meid %2,%0"                                              \
+            : "=g" (__w)                                               \
+            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
+    __w; })
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  ({union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __x;                                                        \
+  __x.__i.__h = (n1); __x.__i.__l = (n0);                              \
+  __asm__ ("deid %2,%0"                                                        
\
+          : "=g" (__x.__ll)                                            \
+          : "0" (__x.__ll), "g" ((USItype)(d)));                       \
+  (r) = __x.__i.__l; (q) = __x.__i.__h; })
+#define count_trailing_zeros(count,x) \
+  do {                                                                 \
+    __asm__ ("ffsd     %2,%0"                                          \
+            : "=r" (count)                                             \
+            : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
+  } while (0)
+#endif /* __ns32000__ */
+
+/* In the past we had a block of various #defines tested
+       _ARCH_PPC    - AIX
+       _ARCH_PWR    - AIX
+       __powerpc__  - gcc
+       __POWERPC__  - BEOS
+       __ppc__      - Darwin
+       PPC          - old gcc, GNU/Linux, SysV
+   The plain PPC test was not good for vxWorks, since PPC is defined on all
+   CPUs there (eg. m68k too), as a constant one is expected to compare
+   CPU_FAMILY against.
+
+   At any rate, this was pretty unattractive and a bit fragile.  The use of
+   HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
+   getting the desired effect.
+
+   ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
+   the system vendor compilers.  (Is that vendor compilers with inline asm,
+   or what?)  */
+
+#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
+  && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    if (__builtin_constant_p (bh) && (bh) == 0)                                
\
+      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"          \
+            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)                
\
+      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"          \
+            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else                                                               \
+      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"         \
+            : "=r" (sh), "=&r" (sl)                                    \
+            : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
+  } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    if (__builtin_constant_p (ah) && (ah) == 0)                                
\
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"      \
+              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)                
\
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"      \
+              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == 0)                   \
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"                
\
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)                
\
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"                
\
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else                                                               \
+      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"     \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
+  } while (0)
+#define count_leading_zeros(count, x) \
+  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#if HAVE_HOST_CPU_FAMILY_powerpc
+#if __GMP_GNUC_PREREQ (4,4)
+#define umul_ppmm(w1, w0, u, v) \
+  do {                                                                 \
+    UDItype __ll = (UDItype)(u) * (v);                                 \
+    w1 = __ll >> 32;                                                   \
+    w0 = __ll;                                                         \
+  } while (0)
+#endif
+#if !defined (umul_ppmm)
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    USItype __m0 = (m0), __m1 = (m1);                                  \
+    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));     \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#endif
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    SItype __m0 = (m0), __m1 = (m1);                                   \
+    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));      \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#define SMUL_TIME 14
+#define UDIV_TIME 120
+#else
+#define UMUL_TIME 8
+#define smul_ppmm(xh, xl, m0, m1) \
+  __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
+#define SMUL_TIME 4
+#define sdiv_qrnnd(q, r, nh, nl, d) \
+  __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
+#define UDIV_TIME 100
+#endif
+#endif /* 32-bit POWER architecture variants.  */
+
+/* We should test _IBMR2 here when we add assembly support for the system
+   vendor compilers.  */
+#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
+#if !defined (_LONG_LONG_LIMB)
+/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
+   use adde etc only when not _LONG_LONG_LIMB.  */
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    if (__builtin_constant_p (bh) && (bh) == 0)                                
\
+      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"          \
+            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)                
\
+      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"          \
+            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else                                                               \
+      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"         \
+            : "=r" (sh), "=&r" (sl)                                    \
+            : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
+  } while (0)
+/* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
+   This might seem strange, but gcc folds away the dead code late.  */
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {                                                                       \
+    if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {         \
+       if (__builtin_constant_p (ah) && (ah) == 0)                           \
+         __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2"               \
+                  : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); 
\
+       else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
+         __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2"               \
+                  : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); 
\
+       else if (__builtin_constant_p (bh) && (bh) == 0)                      \
+         __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2"                 \
+                  : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); 
\
+       else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
+         __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2"                 \
+                  : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); 
\
+       else                                                                  \
+         __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2"              \
+                  : "=r" (sh), "=&r" (sl)                                    \
+                  : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));             \
+      } else {                                                               \
+       if (__builtin_constant_p (ah) && (ah) == 0)                           \
+         __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"         \
+                  : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
+       else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)           \
+         __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"         \
+                  : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));  \
+       else if (__builtin_constant_p (bh) && (bh) == 0)                      \
+         __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"           \
+                  : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
+       else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)           \
+         __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"           \
+                  : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));  \
+       else                                                                  \
+         __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"        \
+                  : "=r" (sh), "=&r" (sl)                                    \
+                  : "r" (ah), "r" (bh), "rI" (al), "r" (bl));                \
+      }                                                                        
      \
+  } while (0)
+#endif /* ! _LONG_LONG_LIMB */
+#define count_leading_zeros(count, x) \
+  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 64
+#if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
+#define umul_ppmm(w1, w0, u, v) \
+  do {                                                                 \
+    typedef unsigned int __ll_UTItype __attribute__((mode(TI)));       \
+    __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                       \
+    w1 = __ll >> 64;                                                   \
+    w0 = __ll;                                                         \
+  } while (0)
+#endif
+#if !defined (umul_ppmm)
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    UDItype __m0 = (m0), __m1 = (m1);                                  \
+    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));     \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#endif
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+  do {                                                                 \
+    DItype __m0 = (m0), __m1 = (m1);                                   \
+    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));      \
+    (pl) = __m0 * __m1;                                                        
\
+  } while (0)
+#define SMUL_TIME 14  /* ??? */
+#define UDIV_TIME 120 /* ??? */
+#endif /* 64-bit PowerPC.  */
+
+#if defined (__pyr__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addw %5,%1\n\taddwc %3,%0"                                 \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
+            "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                 \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
+            "1" ((USItype)(al)), "g" ((USItype)(bl)))
+/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll;                                               \
+          struct {USItype __h, __l;} __i;                              \
+         } __x;                                                        \
+  __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                        
\
+          : "=&r" (__x.__ll)                                           \
+          : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
+  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#endif /* __pyr__ */
+
+#if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("a %1,%5\n\tae %0,%3"                                       \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
+            "%1" ((USItype)(al)), "r" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("s %1,%5\n\tse %0,%3"                                       \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
+            "1" ((USItype)(al)), "r" ((USItype)(bl)))
+#define smul_ppmm(ph, pl, m0, m1) \
+  __asm__ (                                                            \
+       "s      r2,r2\n"                                                \
+"      mts r10,%2\n"                                                   \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      m       r2,%3\n"                                                \
+"      cas     %0,r2,r0\n"                                             \
+"      mfs     r10,%1"                                                 \
+          : "=r" (ph), "=r" (pl)                                       \
+          : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
+          : "r2")
+#define UMUL_TIME 20
+#define UDIV_TIME 200
+#define count_leading_zeros(count, x) \
+  do {                                                                 \
+    if ((x) >= 0x10000)                                                        
\
+      __asm__ ("clz    %0,%1"                                          \
+              : "=r" (count) : "r" ((USItype)(x) >> 16));              \
+    else                                                               \
+      {                                                                        
\
+       __asm__ ("clz   %0,%1"                                          \
+                : "=r" (count) : "r" ((USItype)(x)));                  \
+       (count) += 16;                                                  \
+      }                                                                        
\
+  } while (0)
+#endif /* RT/ROMP */
+
+#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && 
W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"               \
+          : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                         \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
+          __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                         \
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
+          __CLOBBER_CC)
+/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
+   doesn't define anything to indicate that to us, it only sets __sparcv8. */
+#if defined (__sparc_v9__) || defined (__sparcv9)
+/* Perhaps we should use floating-point operations here?  */
+#if 0
+/* Triggers a bug making mpz/tests/t-gcd.c fail.
+   Perhaps we simply need explicitly zero-extend the inputs?  */
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :         \
+          "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
+#else
+/* Use v8 umul until above bug is fixed.  */
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#endif
+/* Use a plain v8 divide for v9.  */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do {                                                                 \
+    USItype __q;                                                       \
+    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                    \
+            : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
+    (r) = (n0) - __q * (d);                                            \
+    (q) = __q;                                                         \
+  } while (0)
+#else
+#if defined (__sparc_v8__)   /* gcc normal */                          \
+  || defined (__sparcv8)     /* gcc solaris */                         \
+  || HAVE_HOST_CPU_supersparc
+/* Don't match immediate range because, 1) it is not often useful,
+   2) the 'I' flag thinks of the range as a 13 bit signed interval,
+   while we want to match a 13 bit interval, sign extended to 32 bits,
+   but INTERPRETED AS UNSIGNED.  */
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#define UMUL_TIME 5
+
+#if HAVE_HOST_CPU_supersparc
+#define UDIV_TIME 60           /* SuperSPARC timing */
+#else
+/* Don't use this on SuperSPARC because its udiv only handles 53 bit
+   dividends and will trap to the kernel for the rest. */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do {                                                                 \
+    USItype __q;                                                       \
+    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                    \
+            : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
+    (r) = (n0) - __q * (d);                                            \
+    (q) = __q;                                                         \
+  } while (0)
+#define UDIV_TIME 25
+#endif /* HAVE_HOST_CPU_supersparc */
+
+#else /* ! __sparc_v8__ */
+#if defined (__sparclite__)
+/* This has hardware multiply but not divide.  It also has two additional
+   instructions scan (ffs from high bit) and divscc.  */
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#define UMUL_TIME 5
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("! Inlined udiv_qrnnd\n"                                    \
+"      wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
+"      tst     %%g0\n"                                                 \
+"      divscc  %3,%4,%%g1\n"                                           \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%%g1\n"                                         \
+"      divscc  %%g1,%4,%0\n"                                           \
+"      rd      %%y,%1\n"                                               \
+"      bl,a 1f\n"                                                      \
+"      add     %1,%4,%1\n"                                             \
+"1:    ! End of inline udiv_qrnnd"                                     \
+          : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
+          : "%g1" __AND_CLOBBER_CC)
+#define UDIV_TIME 37
+#define count_leading_zeros(count, x) \
+  __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
+/* Early sparclites return 63 for an argument of 0, but they warn that future
+   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
+   undefined.  */
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+#endif /* __sparc_v9__ */
+/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
+#ifndef umul_ppmm
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("! Inlined umul_ppmm\n"                                     \
+"      wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
+"      sra     %3,31,%%g2      ! Don't move this insn\n"               \
+"      and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
+"      andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,%3,%%g1\n"                                         \
+"      mulscc  %%g1,0,%%g1\n"                                          \
+"      add     %%g1,%%g2,%0\n"                                         \
+"      rd      %%y,%1"                                                 \
+          : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
+          : "%g1", "%g2" __AND_CLOBBER_CC)
+#define UMUL_TIME 39           /* 39 instructions */
+#endif
+#ifndef udiv_qrnnd
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UWtype __r;                                                     \
+    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                   \
+    (r) = __r;                                                         \
+  } while (0)
+extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
+#ifndef UDIV_TIME
+#define UDIV_TIME 140
+#endif
+#endif /* LONGLONG_STANDALONE */
+#endif /* udiv_qrnnd */
+#endif /* __sparc__ */
+
+#if defined (__sparc__) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ (                                                            \
+       "addcc  %r4,%5,%1\n"                                            \
+      "        addccc  %r6,%7,%%g0\n"                                          
\
+      "        addc    %r2,%3,%0"                                              
\
+         : "=r" (sh), "=&r" (sl)                                       \
+         : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
+           "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
+          __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ (                                                            \
+       "subcc  %r4,%5,%1\n"                                            \
+      "        subccc  %r6,%7,%%g0\n"                                          
\
+      "        subc    %r2,%3,%0"                                              
\
+         : "=r" (sh), "=&r" (sl)                                       \
+         : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),         \
+           "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
+          __CLOBBER_CC)
+#endif
+
+#if defined (__vax__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                 \
+          : "=g" (sh), "=&g" (sl)                                      \
+          : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
+            "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                 \
+          : "=g" (sh), "=&g" (sl)                                      \
+          : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
+            "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define smul_ppmm(xh, xl, m0, m1) \
+  do {                                                                 \
+    union {UDItype __ll;                                               \
+          struct {USItype __l, __h;} __i;                              \
+         } __x;                                                        \
+    USItype __m0 = (m0), __m1 = (m1);                                  \
+    __asm__ ("emul %1,%2,$0,%0"                                                
\
+            : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                            \
+  } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+  do {                                                                 \
+    union {DItype __ll;                                                        
\
+          struct {SItype __l, __h;} __i;                               \
+         } __x;                                                        \
+    __x.__i.__h = n1; __x.__i.__l = n0;                                        
\
+    __asm__ ("ediv %3,%2,%0,%1"                                                
\
+            : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
+  } while (0)
+#if 0
+/* FIXME: This instruction appears to be unimplemented on some systems (vax
+   8800 maybe). */
+#define count_trailing_zeros(count,x)                                  \
+  do {                                                                 \
+    __asm__ ("ffs 0, 31, %1, %0"                                       \
+            : "=g" (count)                                             \
+            : "g" ((USItype) (x)));                                    \
+  } while (0)
+#endif
+#endif /* __vax__ */
+
+#if defined (__z8000__) && W_TYPE_SIZE == 16
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add        %H1,%H5\n\tadc  %H0,%H3"                                
\
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
+            "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub        %H1,%H5\n\tsbc  %H0,%H3"                                
\
+          : "=r" (sh), "=&r" (sl)                                      \
+          : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
+            "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+  do {                                                                 \
+    union {long int __ll;                                              \
+          struct {unsigned int __h, __l;} __i;                         \
+         } __x;                                                        \
+    unsigned int __m0 = (m0), __m1 = (m1);                             \
+    __asm__ ("mult     %S0,%H3"                                        \
+            : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
+            : "%1" (m0), "rQR" (m1));                                  \
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;                            \
+    (xh) += ((((signed int) __m0 >> 15) & __m1)                                
\
+            + (((signed int) __m1 >> 15) & __m0));                     \
+  } while (0)
+#endif /* __z8000__ */
+
+#endif /* __GNUC__ */
+
+#endif /* NO_ASM */
+
+
+/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
+#if !defined (umul_ppmm) && defined (__umulsidi3)
+#define umul_ppmm(ph, pl, m0, m1) \
+  {                                                                    \
+    UDWtype __ll = __umulsidi3 (m0, m1);                               \
+    ph = (UWtype) (__ll >> W_TYPE_SIZE);                               \
+    pl = (UWtype) __ll;                                                        
\
+  }
+#endif
+
+#if !defined (__umulsidi3)
+#define __umulsidi3(u, v) \
+  ({UWtype __hi, __lo;                                                 \
+    umul_ppmm (__hi, __lo, u, v);                                      \
+    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
+#endif
+
+
+/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
+   forms have "reversed" arguments, meaning the pointer is last, which
+   sometimes allows better parameter passing, in particular on 64-bit
+   hppa. */
+
+#define mpn_umul_ppmm  __MPN(umul_ppmm)
+extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
+
+#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
+  && ! defined (LONGLONG_STANDALONE)
+#define umul_ppmm(wh, wl, u, v)                                                
      \
+  do {                                                                       \
+    UWtype __umul_ppmm__p0;                                                  \
+    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));      \
+    (wl) = __umul_ppmm__p0;                                                  \
+  } while (0)
+#endif
+
+#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
+extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
+
+#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r       \
+  && ! defined (LONGLONG_STANDALONE)
+#define umul_ppmm(wh, wl, u, v)                                                
      \
+  do {                                                                       \
+    UWtype __umul_ppmm__p0;                                                  \
+    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0);    \
+    (wl) = __umul_ppmm__p0;                                                  \
+  } while (0)
+#endif
+
+#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
+extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
+
+#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd       \
+  && ! defined (LONGLONG_STANDALONE)
+#define udiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                 \
+    UWtype __udiv_qrnnd__r;                                            \
+    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r,                            \
+                         (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
+    (r) = __udiv_qrnnd__r;                                             \
+  } while (0)
+#endif
+
+#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
+extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
+
+#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r     \
+  && ! defined (LONGLONG_STANDALONE)
+#define udiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                 \
+    UWtype __udiv_qrnnd__r;                                            \
+    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,  \
+                           &__udiv_qrnnd__r);                          \
+    (r) = __udiv_qrnnd__r;                                             \
+  } while (0)
+#endif
+
+
+/* If this machine has no inline assembler, use C macros.  */
+
+#if !defined (add_ssaaaa)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    UWtype __x;                                                                
\
+    __x = (al) + (bl);                                                 \
+    (sh) = (ah) + (bh) + (__x < (al));                                 \
+    (sl) = __x;                                                                
\
+  } while (0)
+#endif
+
+#if !defined (sub_ddmmss)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    UWtype __x;                                                                
\
+    __x = (al) - (bl);                                                 \
+    (sh) = (ah) - (bh) - ((al) < (bl));                                 \
+    (sl) = __x;                                                                
\
+  } while (0)
+#endif
+
+/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
+   smul_ppmm.  */
+#if !defined (umul_ppmm) && defined (smul_ppmm)
+#define umul_ppmm(w1, w0, u, v)                                                
\
+  do {                                                                 \
+    UWtype __w1;                                                       \
+    UWtype __xm0 = (u), __xm1 = (v);                                   \
+    smul_ppmm (__w1, w0, __xm0, __xm1);                                        
\
+    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)              \
+               + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
+  } while (0)
+#endif
+
+/* If we still don't have umul_ppmm, define it using plain C.
+
+   For reference, when this code is used for squaring (ie. u and v identical
+   expressions), gcc recognises __x1 and __x2 are the same and generates 3
+   multiplies, not 4.  The subsequent additions could be optimized a bit,
+   but the only place GMP currently uses such a square is mpn_sqr_basecase,
+   and chips obliged to use this generic C umul will have plenty of worse
+   performance problems than a couple of extra instructions on the diagonal
+   of sqr_basecase.  */
+
+#if !defined (umul_ppmm)
+#define umul_ppmm(w1, w0, u, v)                                                
\
+  do {                                                                 \
+    UWtype __x0, __x1, __x2, __x3;                                     \
+    UHWtype __ul, __vl, __uh, __vh;                                    \
+    UWtype __u = (u), __v = (v);                                       \
+                                                                       \
+    __ul = __ll_lowpart (__u);                                         \
+    __uh = __ll_highpart (__u);                                                
\
+    __vl = __ll_lowpart (__v);                                         \
+    __vh = __ll_highpart (__v);                                                
\
+                                                                       \
+    __x0 = (UWtype) __ul * __vl;                                       \
+    __x1 = (UWtype) __ul * __vh;                                       \
+    __x2 = (UWtype) __uh * __vl;                                       \
+    __x3 = (UWtype) __uh * __vh;                                       \
+                                                                       \
+    __x1 += __ll_highpart (__x0);/* this can't give carry */           \
+    __x1 += __x2;              /* but this indeed can */               \
+    if (__x1 < __x2)           /* did we get it? */                    \
+      __x3 += __ll_B;          /* yes, add it in the proper pos. */    \
+                                                                       \
+    (w1) = __x3 + __ll_highpart (__x1);                                        
\
+    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);              \
+  } while (0)
+#endif
+
+/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
+   exist in one form or another.  */
+#if !defined (smul_ppmm)
+#define smul_ppmm(w1, w0, u, v)                                                
\
+  do {                                                                 \
+    UWtype __w1;                                                       \
+    UWtype __xm0 = (u), __xm1 = (v);                                   \
+    umul_ppmm (__w1, w0, __xm0, __xm1);                                        
\
+    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)              \
+               - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
+  } while (0)
+#endif
+
+/* Define this unconditionally, so it can be used for debugging.  */
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+  do {                                                                 \
+    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                    \
+                                                                       \
+    ASSERT ((d) != 0);                                                 \
+    ASSERT ((n1) < (d));                                               \
+                                                                       \
+    __d1 = __ll_highpart (d);                                          \
+    __d0 = __ll_lowpart (d);                                           \
+                                                                       \
+    __q1 = (n1) / __d1;                                                        
\
+    __r1 = (n1) - __q1 * __d1;                                         \
+    __m = __q1 * __d0;                                                 \
+    __r1 = __r1 * __ll_B | __ll_highpart (n0);                         \
+    if (__r1 < __m)                                                    \
+      {                                                                        
\
+       __q1--, __r1 += (d);                                            \
+       if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
+         if (__r1 < __m)                                               \
+           __q1--, __r1 += (d);                                        \
+      }                                                                        
\
+    __r1 -= __m;                                                       \
+                                                                       \
+    __q0 = __r1 / __d1;                                                        
\
+    __r0 = __r1  - __q0 * __d1;                                                
\
+    __m = __q0 * __d0;                                                 \
+    __r0 = __r0 * __ll_B | __ll_lowpart (n0);                          \
+    if (__r0 < __m)                                                    \
+      {                                                                        
\
+       __q0--, __r0 += (d);                                            \
+       if (__r0 >= (d))                                                \
+         if (__r0 < __m)                                               \
+           __q0--, __r0 += (d);                                        \
+      }                                                                        
\
+    __r0 -= __m;                                                       \
+                                                                       \
+    (q) = __q1 * __ll_B | __q0;                                                
\
+    (r) = __r0;                                                                
\
+  } while (0)
+
+/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
+   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
+#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
+#define udiv_qrnnd(q, r, nh, nl, d) \
+  do {                                                                 \
+    UWtype __r;                                                                
\
+    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);                                
\
+    (r) = __r;                                                         \
+  } while (0)
+__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
+#endif
+
+/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
+#if !defined (udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#endif
+
+#if !defined (count_leading_zeros)
+#define count_leading_zeros(count, x) \
+  do {                                                                 \
+    UWtype __xr = (x);                                                 \
+    UWtype __a;                                                                
\
+                                                                       \
+    if (W_TYPE_SIZE == 32)                                             \
+      {                                                                        
\
+       __a = __xr < ((UWtype) 1 << 2*__BITS4)                          \
+         ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)          \
+         : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1           \
+         : 3*__BITS4 + 1);                                             \
+      }                                                                        
\
+    else                                                               \
+      {                                                                        
\
+       for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)                  \
+         if (((__xr >> __a) & 0xff) != 0)                              \
+           break;                                                      \
+       ++__a;                                                          \
+      }                                                                        
\
+                                                                       \
+    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];          \
+  } while (0)
+/* This version gives a well-defined value for zero. */
+#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#define COUNT_LEADING_ZEROS_SLOW
+#endif
+
+/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
+#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#endif
+
+#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
+#endif
+
+#if !defined (count_trailing_zeros)
+#if !defined (COUNT_LEADING_ZEROS_SLOW)
+/* Define count_trailing_zeros using an asm count_leading_zeros.  */
+#define count_trailing_zeros(count, x)                                 \
+  do {                                                                 \
+    UWtype __ctz_x = (x);                                              \
+    UWtype __ctz_c;                                                    \
+    ASSERT (__ctz_x != 0);                                             \
+    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                 \
+    (count) = W_TYPE_SIZE - 1 - __ctz_c;                               \
+  } while (0)
+#else
+/* Define count_trailing_zeros in plain C, assuming small counts are common.
+   We use clz_tab without ado, since the C count_leading_zeros above will have
+   pulled it in.  */
+#define count_trailing_zeros(count, x)                                 \
+  do {                                                                 \
+    UWtype __ctz_x = (x);                                              \
+    int __ctz_c;                                                       \
+                                                                       \
+    if (LIKELY ((__ctz_x & 0xff) != 0))                                        
\
+      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;                     \
+    else                                                               \
+      {                                                                        
\
+       for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)  \
+         {                                                             \
+           __ctz_x >>= 8;                                              \
+           if (LIKELY ((__ctz_x & 0xff) != 0))                         \
+             break;                                                    \
+         }                                                             \
+                                                                       \
+       (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];              \
+      }                                                                        
\
+  } while (0)
+#endif
+#endif
+
+#ifndef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
+
+/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
+   that hence the latter should always be used.  */
+#ifndef UDIV_PREINV_ALWAYS
+#define UDIV_PREINV_ALWAYS 0
+#endif
+
+/* Give defaults for UMUL_TIME and UDIV_TIME.  */
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
-- 
1.8.0.rc0.18.gf84667d


>From 49f5c21fff9241c195d74101a334fdc2c8dc33e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <address@hidden>
Date: Wed, 19 Sep 2012 16:20:11 +0200
Subject: [PATCH 6/8] factor: more improvements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/factor-ng.c: Import some improvements from
http://gmplib.org:8000/factoring
Co-authored-by: Torbjörn Granlund <address@hidden>
---
 src/factor-ng.c | 124 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 66 insertions(+), 58 deletions(-)

diff --git a/src/factor-ng.c b/src/factor-ng.c
index 776aaa6..36be5e6 100644
--- a/src/factor-ng.c
+++ b/src/factor-ng.c
@@ -1743,8 +1743,10 @@ is_square (uintmax_t x)
   return 0;
 }

-static const unsigned short invtab[] =
+/* invtab[i] = floor(0x10000 / (0x100 + i) */
+static const unsigned short invtab[0x81] =
   {
+    0x200,
     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
@@ -1767,17 +1769,22 @@ static const unsigned short invtab[] =
    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
 #define div_smallq(q, r, u, d)                                          \
   do {                                                                  \
-    if (0 && (u) / 0x40 < (d))                                          \
+    if ((u) / 0x40 < (d))                                               \
       {                                                                 \
         int _cnt;                                                       \
         uintmax_t _dinv, _mask, _q, _r;                                 \
         count_leading_zeros (_cnt, (d));                                \
-                                                                        \
-        _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt))                \
-                       - (1 << (8 - 1))];                               \
-                                                                        \
         _r = (u);                                                       \
-        _q = _r * _dinv >> (W_TYPE_SIZE + 8 - _cnt);                    \
+        if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
+          {                                                             \
+            _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
+            _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
+          }                                                             \
+        else                                                            \
+          {                                                             \
+            _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
+            _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
+          }                                                             \
         _r -= _q*(d);                                                   \
                                                                         \
         _mask = -(uintmax_t) (_r >= (d));                               \
@@ -1837,8 +1844,9 @@ static unsigned int q_freq[Q_FREQ_SIZE + 1];
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif

-
-static void
+/* Return true on success.  Expected to fail only for numbers
+   >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
+static bool
 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
 {
   /* Uses algorithm and notation from
@@ -1858,35 +1866,41 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)
       1155, 15, 231, 35, 3, 55, 7, 11, 0
     };

-  uintmax_t S;
   const unsigned int *m;

   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];

-  S = isqrt2 (n1, n0);
+  if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
+    return false;
+
+  uintmax_t sqrt_n = isqrt2 (n1, n0);

-  if (n0 == S * S)
+  if (n0 == sqrt_n * sqrt_n)
     {
       uintmax_t p1, p0;

-      umul_ppmm (p1, p0, S, S);
+      umul_ppmm (p1, p0, sqrt_n, sqrt_n);
       assert (p0 == n0);

       if (n1 == p1)
         {
-          if (prime_p (S))
-            factor_insert_multiplicity (factors, S, 2);
+          if (prime_p (sqrt_n))
+            factor_insert_multiplicity (factors, sqrt_n, 2);
           else
             {
               struct factors f;

               f.nfactors = 0;
-              factor_using_squfof (0, S, &f);
+              if (!factor_using_squfof (0, sqrt_n, &f))
+                {
+                  /* Try pollard rho instead */
+                  factor_using_pollard_rho (sqrt_n, 1, &f);
+                }
               /* Duplicate the new factors */
               for (unsigned int i = 0; i < f.nfactors; i++)
                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
             }
-          return;
+          return true;
         }
     }

@@ -1925,6 +1939,7 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)
       Dh += n1 * mu;

       assert (Dl % 4 != 1);
+      assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));

       S = isqrt2 (Dh, Dl);

@@ -1943,10 +1958,10 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)

       for (i = 0; i <= B; i++)
         {
-          uintmax_t q, P1, t, r;
+          uintmax_t q, P1, t, rem;

-          div_smallq (q, r, S+P, Q);
-          P1 = S - r;   /* P1 = q*Q - P */
+          div_smallq (q, rem, S+P, Q);
+          P1 = S - rem; /* P1 = q*Q - P */

 #if STAT_SQUFOF
           assert (q > 0);
@@ -2025,29 +2040,25 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)
                      for the case D = 2N. */
                   /* Compute Q = (D - P*P) / Q1, but we need double
                      precision. */
-                  {
-                    uintmax_t hi, lo, rem;
-                    umul_ppmm (hi, lo, P, P);
-                    sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
-                    udiv_qrnnd (Q, rem, hi, lo, Q1);
-                    assert (rem == 0);
-                  }
+                  uintmax_t hi, lo;
+                  umul_ppmm (hi, lo, P, P);
+                  sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
+                  udiv_qrnnd (Q, rem, hi, lo, Q1);
+                  assert (rem == 0);

                   for (;;)
                     {
-                      uintmax_t r;
-
                       /* Note: There appears to by a typo in the paper,
                          Step 4a in the algorithm description says q <--
                          floor([S+P]/\hat Q), but looking at the equations
                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
                          (In this code, \hat Q is Q1). */
-                      div_smallq (q, r, S+P, Q);
-                      P1 = S - r;       /* P1 = q*Q - P */
+                      div_smallq (q, rem, S+P, Q);
+                      P1 = S - rem;     /* P1 = q*Q - P */

 #if STAT_SQUFOF
                       q_freq[0]++;
-                      q_freq[MIN(q, Q_FREQ_SIZE)]++;
+                      q_freq[MIN (q, Q_FREQ_SIZE)]++;
 #endif
                       if (P == P1)
                         break;
@@ -2065,8 +2076,8 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)

                   if (prime_p (Q))
                     factor_insert (factors, Q);
-                  else
-                    factor_using_squfof (0, Q, factors);
+                  else if (!factor_using_squfof (0, Q, factors))
+                    factor_using_pollard_rho (Q, 2, factors);

                   divexact_21 (n1, n0, n1, n0, Q);

@@ -2074,13 +2085,16 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)
                     factor_insert_large (factors, n1, n0);
                   else
                     {
-                      if (n1 == 0)
-                        factor_using_pollard_rho (n0, 1, factors);
-                      else
-                        factor_using_squfof (n1, n0, factors);
+                      if (!factor_using_squfof (n1, n0, factors))
+                        {
+                          if (n1 == 0)
+                            factor_using_pollard_rho (n0, 1, factors);
+                          else
+                            factor_using_pollard_rho2 (n1, n0, 1, factors);
+                        }
                     }

-                  return;
+                  return true;
                 }
             }
         next_i:
@@ -2089,8 +2103,7 @@ factor_using_squfof (uintmax_t n1, uintmax_t n0, struct 
factors *factors)
     next_multiplier:
       ;
     }
-  fprintf (stderr, "squfof failed.\n");
-  exit (EXIT_FAILURE);
+  return false;
 }

 static void
@@ -2104,26 +2117,21 @@ factor (uintmax_t t1, uintmax_t t0, struct factors 
*factors)

   t0 = factor_using_division (&t1, t1, t0, factors);

-  if (t1 == 0)
-    {
-      if (t0 != 1)
-        {
-          if (prime_p (t0))
-            factor_insert (factors, t0);
-          else if (alg == ALG_POLLARD_RHO)
-            factor_using_pollard_rho (t0, 1, factors);
-          else
-            factor_using_squfof (0, t0, factors);
-        }
-    }
+  if (t1 == 0 && t0 < 2)
+    return;
+
+  if (prime2_p (t1, t0))
+    factor_insert_large (factors, t1, t0);
   else
     {
-      if (prime2_p (t1, t0))
-        factor_insert_large (factors, t1, t0);
-      else if (alg == ALG_POLLARD_RHO)
-        factor_using_pollard_rho2 (t1, t0, 1, factors);
+      if (alg == ALG_SQUFOF)
+        if (factor_using_squfof (t1, t0, factors))
+          return;
+
+      if (t1 == 0)
+        factor_using_pollard_rho (t0, 1, factors);
       else
-        factor_using_squfof (t1, t0, factors);
+        factor_using_pollard_rho2 (t1, t0, 1, factors);
     }
 }

-- 
1.8.0.rc0.18.gf84667d


>From 759ebcb57db73449b5670204f85931d34881b7d2 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Sun, 16 Sep 2012 22:31:04 +0200
Subject: [PATCH 7/8] factor: merge with preexisting factor; integrate tests;
 avoid warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/factor.c: Renamed from factor-ng.c, with the following changes:
Adjust copyright header to be consistent with others.
Use xmalloc and xrealloc, to avoid segv upon OOM.
Switch back to using readtokens to handle input.
Diagnose invalid inputs.
s/fprintf+exit/error/
(print_factors): Add comments.
(strto2uintmax): Return strtol_error, not int.
(read_item): Remove, no longer used.
(main): Use atexit(close_stdout) so that we don't ignore failed write.
* cfg.mk: Exempt src/longlong.h from several tests.
Exempt run.sh from the test-list-consistency test.
Exempt make-prime-list.c from numerous tests, since we won't
be making it conform: it must not link with libcoreutils.a.
Exempt factor-ng.c from the no-upper-case error message test.
* AUTHORS (factor): Add Torbjörn and Niels.
* tests/local.mk (factor_tests): Encode the 37 tests.
($(factor_tests)): Rule to generate a test script for each test.
* tests/factor/run.sh: New script, marked as very expensive.
* .gitignore: Ignore new generated files.
* src/local.mk (src/primes.h): New rule.
(noinst_PROGRAMS): Add make-prime-list.
(noinst_HEADERS): Add longlong.h.
Remove all wheel-related rules and files.
* src/wheel-gen.pl: Remove file.

maint: mark set-but-not-used variables with ATTRIBUTE_UNUSED
* src/factor-ng.c (redcify, prime_p, isqrt2): Mark them, so we
don't have to disable -Wunused-but-set-variable.

maint: use __builtin_expect only if __GNUC__
* src/factor-ng.c (LIKELY, UNLIKELY) [__GNUC__]: Add #ifdef guard.

build: avoid warning about unused macro
* src/factor-ng.c (__GMP_DECLSPEC): Don't define here
* src/longlong.h (__GMP_DECLSPEC): Define if not already defined.
---
 .gitignore          |    6 +-
 AUTHORS             |    2 +-
 cfg.mk              |   23 +-
 src/.gitignore      |    2 -
 src/factor-ng.c     | 2396 ------------------------------------------------
 src/factor.c        | 2497 ++++++++++++++++++++++++++++++++++++++++++++-------
 src/local.mk        |   31 +-
 src/longlong.h      |    6 +-
 src/wheel-gen.pl    |  114 ---
 tests/factor/run.sh |   30 +
 tests/local.mk      |   71 +-
 11 files changed, 2327 insertions(+), 2851 deletions(-)
 delete mode 100644 src/factor-ng.c
 delete mode 100755 src/wheel-gen.pl
 create mode 100755 tests/factor/run.sh

diff --git a/.gitignore b/.gitignore
index 56a9bf4..8a423f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,7 +37,7 @@
 /coreutils-*.tar.xz
 /coreutils-*.tar.xz.sig
 /gnulib-tests
-/lib/.cvsignore
+/lib/.dirstamp
 /lib/.gitignore
 /lib/alloca.h
 /lib/arg-nonnull.h
@@ -74,6 +74,7 @@
 /lib/ref-del.sed
 /lib/selinux
 /lib/signal.h
+/lib/spawn.h
 /lib/stamp-h1
 /lib/stdalign.h
 /lib/stdio.h
@@ -152,12 +153,15 @@
 /po/remove-potcdate.sin
 /po/stamp-po
 /src/cu-progs.mk
+/src/make-prime-list
+/src/primes.h
 /src/version.c
 /src/version.h
 /stamp-h1
 /tests/*/*.log
 /tests/*/*.trs
 /tests/.built-programs
+/tests/factor/[0-9]*.sh
 /tests/t?
 /tests/test-suite.log
 ID
diff --git a/AUTHORS b/AUTHORS
index 9984a2e..552e9d4 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -26,7 +26,7 @@ echo: Brian Fox, Chet Ramey
 env: Richard Mlynarik, David MacKenzie
 expand: David MacKenzie
 expr: Mike Parker, James Youngman, Paul Eggert
-factor: Paul Rubin
+factor: Paul Rubin, Torbjörn Granlund, Niels Möller
 false: Jim Meyering
 fmt: Ross Paterson
 fold: David MacKenzie
diff --git a/cfg.mk b/cfg.mk
index 079ab5b..1020977 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -111,7 +111,7 @@ sc_tests_list_consistency:
          for t in $(all_tests); do echo $$t; done;                     \
          cd $(top_srcdir);                                             \
          $(SHELL) build-aux/vc-list-files tests                        \
-           | grep -v '^tests/init\.sh$$'                               \
+           | grep -Ev '^tests/(factor/run|init)\.sh$$'                 \
            | $(EGREP) "$$test_extensions_rx\$$";                       \
        } | sort | uniq -u | grep . && exit 1; :

@@ -515,10 +515,11 @@ update-copyright-env = \
 # List syntax-check exemptions.
 exclude_file_name_regexp--sc_space_tab = \
   ^(tests/pr/|tests/misc/nl\.sh$$|gl/.*\.diff$$)
-exclude_file_name_regexp--sc_bindtextdomain = ^(gl/.*|lib/euidaccess-stat)\.c$$
+exclude_file_name_regexp--sc_bindtextdomain = \
+  ^(gl/.*|lib/euidaccess-stat|src/make-prime-list)\.c$$
 exclude_file_name_regexp--sc_trailing_blank = ^tests/pr/
 exclude_file_name_regexp--sc_system_h_headers = \
-  ^src/((system|copy)\.h|libstdbuf\.c)$$
+  ^src/((system|copy)\.h|libstdbuf\.c|make-prime-list\.c)$$

 _src = (false|lbracket|ls-(dir|ls|vdir)|tac-pipe|uname-(arch|uname))
 exclude_file_name_regexp--sc_require_config_h_first = \
@@ -530,7 +531,8 @@ exclude_file_name_regexp--sc_po_check = ^gl/
 exclude_file_name_regexp--sc_prohibit_always-defined_macros = \
   ^src/(seq|remove)\.c$$
 exclude_file_name_regexp--sc_prohibit_empty_lines_at_EOF = ^tests/pr/
-exclude_file_name_regexp--sc_program_name = ^(gl/.*|lib/euidaccess-stat)\.c$$
+exclude_file_name_regexp--sc_program_name = \
+  ^(gl/.*|lib/euidaccess-stat|src/make-prime-list)\.c$$
 exclude_file_name_regexp--sc_file_system = \
   NEWS|^(init\.cfg|src/df\.c|tests/df/df-P\.sh)$$
 exclude_file_name_regexp--sc_prohibit_always_true_header_tests = \
@@ -539,14 +541,20 @@ exclude_file_name_regexp--sc_prohibit_fail_0 = \
   (^.*/git-hooks/commit-msg|^tests/init\.sh|Makefile\.am|\.mk|.*\.texi)$$
 exclude_file_name_regexp--sc_prohibit_atoi_atof = ^lib/euidaccess-stat\.c$$

+# longlong.h is maintained elsewhere.
+_ll = ^src/longlong\.h$$
+exclude_file_name_regexp--sc_useless_cpp_parens = $(_ll)
+exclude_file_name_regexp--sc_long_lines = $(_ll)
+exclude_file_name_regexp--sc_space_before_open_paren = $(_ll)
+
 tbi_1 = ^tests/pr/|(^gl/lib/reg.*\.c\.diff|\.mk|^man/help2man)$$
 tbi_2 = ^scripts/git-hooks/(pre-commit|pre-applypatch|applypatch-msg)$$
-tbi_3 = (GNU)?[Mm]akefile(\.am)?$$
+tbi_3 = (GNU)?[Mm]akefile(\.am)?$$|$(_ll)
 exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \
   $(tbi_1)|$(tbi_2)|$(tbi_3)

 exclude_file_name_regexp--sc_preprocessor_indentation = \
-  ^(gl/lib/rand-isaac\.[ch]|gl/tests/test-rand-isaac\.c)$$
+  ^(gl/lib/rand-isaac\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(__ll)
 exclude_file_name_regexp--sc_prohibit_stat_st_blocks = \
   ^(src/system\.h|tests/du/2g\.sh)$$

@@ -560,6 +568,9 @@ exclude_file_name_regexp--sc_prohibit_test_backticks = \
 exclude_file_name_regexp--sc_prohibit_operator_at_end_of_line = \
   ^src/(ptx|test|head)\.c$$

+exclude_file_name_regexp--sc_error_message_uppercase = ^src/factor\.c$$
+exclude_file_name_regexp--sc_prohibit_atoi_atof = ^src/make-prime-list\.c$$
+
 # Augment AM_CFLAGS to include our per-directory options:
 AM_CFLAGS += $($(@D)_CFLAGS)

diff --git a/src/.gitignore b/src/.gitignore
index 1c1edbb..18cccc1 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -109,8 +109,6 @@ uptime
 users
 vdir
 wc
-wheel-size.h
-wheel.h
 who
 whoami
 yes
diff --git a/src/factor-ng.c b/src/factor-ng.c
deleted file mode 100644
index 36be5e6..0000000
--- a/src/factor-ng.c
+++ /dev/null
@@ -1,2396 +0,0 @@
-/* Factoring of uintmax_t numbers.
-
-   Contributed to the GNU project by Torbjörn Granlund and Niels Möller
-   Contains code from GNU MP.
-
-Copyright 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2005, 2009, 2012
-Free Software Foundation, Inc.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 3 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE.  See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program.  If not, see http://www.gnu.org/licenses/.  */
-
-/* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
-   or, with GMP, numbers of any size.
-
-  Code organisation:
-
-    There are several variants of many functions, for handling one word, two
-    words, and GMP's mpz_t type.  If the one-word variant is called foo, the
-    two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
-    some cases, the plain function variants will handle both one-word and
-    two-word numbers, evidenced by function arguments.
-
-    The factoring code for two words will fall into the code for one word when
-    progress allows that.
-
-    Using GMP is optional.  Define HAVE_GMP to make this code include GMP
-    factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
-    (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
-    factoring code even if numbers get small enough for using the two-word
-    code.
-
-  Algorithm:
-
-    (1) Perform trial division using a small primes table, but without hardware
-        division since the primes table store inverses modulo the word base.
-        (The GMP variant of this code doesn't make use of the precomputed
-        inverses, but instead relies on GMP for fast divisibility testing.)
-    (2) Check the nature of any non-factored part using Miller-Rabin for
-        detecting composites, and Lucas for detecting primes.
-    (3) Factor any remaining composite part using the Pollard-Brent rho
-        algorithm or the SQUFOF algorithm, checking status of found factors
-        again using Miller-Rabin and Lucas.
-
-    We prefer using Hensel norm in the divisions, not the more familiar
-    Euclidian norm, since the former leads to much faster code.  In the
-    Pollard-Brent rho code and the the prime testing code, we use Montgomery's
-    trick of multiplying all n-residues by the word base, allowing cheap Hensel
-    reductions mod n.
-
-  Improvements:
-
-    * Use modular inverses also for exact division in the Lucas code, and
-      elsewhere.  A problem is to locate the inverses not from an index, but
-      from a prime.  We might instead compute the inverse on-the-fly.
-
-    * Tune trial division table size (not forgetting that this is a standalone
-      program where the table will be read from disk for each invocation).
-
-    * Implement less naive powm, using k-ary exponentiation for k = 3 or
-      perhaps k = 4.
-
-    * Try to speed trial division code for single uintmax_t numbers, i.e., the
-      code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
-      IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
-      using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
-      respectively cycles ought to be possible.
-
-    * The redcify function could be vastly improved by using (plain Euclidian)
-      pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
-      GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
-      similar methoods.  These functions currently dominate run time when using
-      the -w option.
-*/
-
-#include <config.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <errno.h>
-#include <assert.h>
-#include <ctype.h>
-#include <string.h>             /* for memmove */
-#include <unistd.h>             /* for getopt */
-#include <stdbool.h>
-
-#include "system.h"
-
-#if HAVE_GMP
-# include <gmp.h>
-#endif
-
-#ifndef STAT_SQUFOF
-# define STAT_SQUFOF 0
-#endif
-
-#ifndef USE_LONGLONG_H
-# define USE_LONGLONG_H 1
-#endif
-
-#if USE_LONGLONG_H
-
-/* Make definitions for longlong.h to make it do what it can do for us */
-# define W_TYPE_SIZE 64          /* bitcount for uintmax_t */
-# define UWtype  uintmax_t
-# define UHWtype unsigned long int
-# undef UDWtype
-# if HAVE_ATTRIBUTE_MODE
-typedef unsigned int UQItype    __attribute__ ((mode (QI)));
-typedef          int SItype     __attribute__ ((mode (SI)));
-typedef unsigned int USItype    __attribute__ ((mode (SI)));
-typedef          int DItype     __attribute__ ((mode (DI)));
-typedef unsigned int UDItype    __attribute__ ((mode (DI)));
-# else
-typedef unsigned char UQItype;
-typedef          long SItype;
-typedef unsigned long int USItype;
-#  if HAVE_LONG_LONG
-typedef long long int DItype;
-typedef unsigned long long int UDItype;
-#  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
-typedef long int DItype;
-typedef unsigned long int UDItype;
-#  endif
-# endif
-# define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files 
*/
-# define ASSERT(x)               /* FIXME make longlong.h really standalone */
-# define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
-# define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
-# ifndef __GMP_GNUC_PREREQ
-#  define __GMP_GNUC_PREREQ(a,b) 1
-# endif
-# if _ARCH_PPC
-#  define HAVE_HOST_CPU_FAMILY_powerpc 1
-# endif
-# include "longlong.h"
-# ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
-const unsigned char factor_clz_tab[129] =
-{
-  1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
-  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-  9
-};
-# endif
-
-#else /* not USE_LONGLONG_H */
-
-# define W_TYPE_SIZE (8 * sizeof (uintmax_t))
-# define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
-# define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
-# define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
-
-#endif
-
-enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
-
-static enum alg_type alg;
-
-/* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
-#define MAX_NFACTS 26
-
-struct factors
-{
-  uintmax_t     plarge[2]; /* Can have a single large factor */
-  uintmax_t     p[MAX_NFACTS];
-  unsigned char e[MAX_NFACTS];
-  unsigned char nfactors;
-};
-
-#if HAVE_GMP
-struct mp_factors
-{
-  mpz_t         *p;
-  unsigned long int *e;
-  unsigned long int nfactors;
-};
-#endif
-
-static void factor (uintmax_t, uintmax_t, struct factors *);
-
-#ifndef umul_ppmm
-# define umul_ppmm(w1, w0, u, v)                                         \
-  do {                                                                  \
-    uintmax_t __x0, __x1, __x2, __x3;                                   \
-    unsigned long int __ul, __vl, __uh, __vh;                           \
-    uintmax_t __u = (u), __v = (v);                                     \
-                                                                        \
-    __ul = __ll_lowpart (__u);                                          \
-    __uh = __ll_highpart (__u);                                         \
-    __vl = __ll_lowpart (__v);                                          \
-    __vh = __ll_highpart (__v);                                         \
-                                                                        \
-    __x0 = (uintmax_t) __ul * __vl;                                     \
-    __x1 = (uintmax_t) __ul * __vh;                                     \
-    __x2 = (uintmax_t) __uh * __vl;                                     \
-    __x3 = (uintmax_t) __uh * __vh;                                     \
-                                                                        \
-    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
-    __x1 += __x2;               /* but this indeed can */               \
-    if (__x1 < __x2)            /* did we get it? */                    \
-      __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
-                                                                        \
-    (w1) = __x3 + __ll_highpart (__x1);                                 \
-    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
-  } while (0)
-#endif
-
-#if !defined(udiv_qrnnd) || defined(UDIV_NEEDS_NORMALIZATION)
-/* Define our own, not needing normalization. This function is
-   currently not performance critical, so keep it simple. Similar to
-   the mod macro below. */
-# undef udiv_qrnnd
-# define udiv_qrnnd(q, r, n1, n0, d)                                     \
-  do {                                                                  \
-    uintmax_t __d1, __d0, __q, __r1, __r0;                              \
-                                                                        \
-    assert ((n1) < (d));                                                \
-    __d1 = (d); __d0 = 0;                                               \
-    __r1 = (n1); __r0 = (n0);                                           \
-    __q = 0;                                                            \
-    for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
-      {                                                                 \
-        rsh2 (__d1, __d0, __d1, __d0, 1);                               \
-        __q <<= 1;                                                      \
-        if (ge2 (__r1, __r0, __d1, __d0))                               \
-          {                                                             \
-            __q++;                                                      \
-            sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
-          }                                                             \
-      }                                                                 \
-    (r) = __r0;                                                         \
-    (q) = __q;                                                          \
-  } while (0)
-#endif
-
-#if !defined (add_ssaaaa)
-# define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
-  do {                                                                  \
-    uintmax_t _add_x;                                                   \
-    _add_x = (al) + (bl);                                               \
-    (sh) = (ah) + (bh) + (_add_x < (al));                               \
-    (sl) = _add_x;                                                      \
-  } while (0)
-#endif
-
-#define rsh2(rh, rl, ah, al, cnt)                                       \
-  do {                                                                  \
-    (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
-    (rh) = (ah) >> (cnt);                                               \
-  } while (0)
-
-#define lsh2(rh, rl, ah, al, cnt)                                       \
-  do {                                                                  \
-    (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
-    (rl) = (al) << (cnt);                                               \
-  } while (0)
-
-#define ge2(ah, al, bh, bl)                                             \
-  ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
-
-#define gt2(ah, al, bh, bl)                                             \
-  ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
-
-#ifndef sub_ddmmss
-# define sub_ddmmss(rh, rl, ah, al, bh, bl)                              \
-  do {                                                                  \
-    uintmax_t _cy;                                                      \
-    _cy = (al) < (bl);                                                  \
-    (rl) = (al) - (bl);                                                 \
-    (rh) = (ah) - (bh) - _cy;                                           \
-  } while (0)
-#endif
-
-#ifndef count_leading_zeros
-# define count_leading_zeros(count, x) do {                              \
-    uintmax_t __clz_x = (x);                                            \
-    unsigned int __clz_c;                                               \
-    for (__clz_c = 0;                                                   \
-         (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
-         __clz_c += 8)                                                  \
-      __clz_x <<= 8;                                                    \
-    for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
-      __clz_x <<= 1;                                                    \
-    (count) = __clz_c;                                                  \
-  } while (0)
-#endif
-
-#ifndef count_trailing_zeros
-# define count_trailing_zeros(count, x) do {                             \
-    uintmax_t __ctz_x = (x);                                            \
-    unsigned int __ctz_c = 0;                                           \
-    while ((__ctz_x & 1) == 0)                                          \
-      {                                                                 \
-        __ctz_x >>= 1;                                                  \
-        __ctz_c++;                                                      \
-      }                                                                 \
-    (count) = __ctz_c;                                                  \
-  } while (0)
-#endif
-
-/* Requires that a < n and b <= n */
-#define submod(r,a,b,n)                                                 \
-  do {                                                                  \
-    uintmax_t _t = - (uintmax_t) (a < b);                               \
-    (r) = ((n) & _t) + (a) - (b);                                       \
-  } while (0)
-
-#define addmod(r,a,b,n)                                                 \
-  submod((r), (a), ((n) - (b)), (n))
-
-/* Modular two-word addition and subtraction.  For performance reasons, the
-   most significant bit of n1 must be clear.  The destination variables must be
-   distinct from the mod operand.  */
-#define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
-  do {                                                                  \
-    add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
-    if (ge2 ((r1), (r0), (n1), (n0)))                                   \
-      sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
-  } while (0)
-#define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
-  do {                                                                  \
-    sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
-    if ((intmax_t) (r1) < 0)                                            \
-      add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
-  } while (0)
-
-#define HIGHBIT_TO_MASK(x)                                              \
-  (((intmax_t)-1 >> 1) < 0                                              \
-   ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
-   : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
-      ? UINTMAX_MAX : (uintmax_t) 0))
-
-/* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
-   Requires that d1 != 0.  */
-static uintmax_t
-mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
-{
-  int cntd, cnta;
-
-  assert (d1 != 0);
-
-  if (a1 == 0)
-    {
-      *r1 = 0;
-      return a0;
-    }
-
-  count_leading_zeros (cntd, d1);
-  count_leading_zeros (cnta, a1);
-  int cnt = cntd - cnta;
-  lsh2 (d1, d0, d1, d0, cnt);
-  for (int i = 0; i < cnt; i++)
-    {
-      if (ge2 (a1, a0, d1, d0))
-        sub_ddmmss (a1, a0, a1, a0, d1, d0);
-      rsh2 (d1, d0, d1, d0, 1);
-    }
-
-  *r1 = a1;
-  return a0;
-}
-
-static uintmax_t _GL_ATTRIBUTE_CONST
-gcd_odd (uintmax_t a, uintmax_t b)
-{
-  if ( (b & 1) == 0)
-    {
-      uintmax_t t = b;
-      b = a;
-      a = t;
-    }
-  if (a == 0)
-    return b;
-
-  /* Take out least significant one bit, to make room for sign */
-  b >>= 1;
-
-  for (;;)
-    {
-      uintmax_t t;
-      uintmax_t bgta;
-
-      while ((a & 1) == 0)
-        a >>= 1;
-      a >>= 1;
-
-      t = a - b;
-      if (t == 0)
-        return (a << 1) + 1;
-
-      bgta = HIGHBIT_TO_MASK (t);
-
-      /* b <-- min (a, b) */
-      b += (bgta & t);
-
-      /* a <-- |a - b| */
-      a = (t ^ bgta) - bgta;
-    }
-}
-
-static uintmax_t
-gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t 
b0)
-{
-  while ((a0 & 1) == 0)
-    rsh2 (a1, a0, a1, a0, 1);
-  while ((b0 & 1) == 0)
-    rsh2 (b1, b0, b1, b0, 1);
-
-  for (;;)
-    {
-      if ((b1 | a1) == 0)
-        {
-          *r1 = 0;
-          return gcd_odd (b0, a0);
-        }
-
-      if (gt2 (a1, a0, b1, b0))
-        {
-          sub_ddmmss (a1, a0, a1, a0, b1, b0);
-          do
-            rsh2 (a1, a0, a1, a0, 1);
-          while ((a0 & 1) == 0);
-        }
-      else if (gt2 (b1, b0, a1, a0))
-        {
-          sub_ddmmss (b1, b0, b1, b0, a1, a0);
-          do
-            rsh2 (b1, b0, b1, b0, 1);
-          while ((b0 & 1) == 0);
-        }
-      else
-        break;
-    }
-
-  *r1 = a1;
-  return a0;
-}
-
-static void
-factor_insert_multiplicity (struct factors *factors,
-                            uintmax_t prime, unsigned int m)
-{
-  unsigned int nfactors = factors->nfactors;
-  uintmax_t *p = factors->p;
-  unsigned char *e = factors->e;
-
-  /* Locate position for insert new or increment e.  */
-  int i;
-  for (i = nfactors - 1; i >= 0; i--)
-    {
-      if (p[i] <= prime)
-        break;
-    }
-
-  if (i < 0 || p[i] != prime)
-    {
-      for (int j = nfactors - 1; j > i; j--)
-        {
-          p[j + 1] = p[j];
-          e[j + 1] = e[j];
-        }
-      p[i + 1] = prime;
-      e[i + 1] = m;
-      factors->nfactors = nfactors + 1;
-    }
-  else
-    {
-      e[i] += m;
-    }
-}
-
-#define factor_insert(f, p) factor_insert_multiplicity(f, p, 1)
-
-static void
-factor_insert_large (struct factors *factors,
-                     uintmax_t p1, uintmax_t p0)
-{
-  if (p1 > 0)
-    {
-      assert (factors->plarge[1] == 0);
-      factors->plarge[0] = p0;
-      factors->plarge[1] = p1;
-    }
-  else
-    factor_insert (factors, p0);
-}
-
-#if HAVE_GMP
-static void mp_factor (mpz_t, struct mp_factors *);
-
-static void
-mp_factor_init (struct mp_factors *factors)
-{
-  factors->p = malloc (1);
-  factors->e = malloc (1);
-  factors->nfactors = 0;
-}
-
-static void
-mp_factor_clear (struct mp_factors *factors)
-{
-  for (unsigned int i = 0; i < factors->nfactors; i++)
-    mpz_clear (factors->p[i]);
-
-  free (factors->p);
-  free (factors->e);
-}
-
-static void
-mp_factor_insert (struct mp_factors *factors, mpz_t prime)
-{
-  unsigned long int nfactors = factors->nfactors;
-  mpz_t         *p  = factors->p;
-  unsigned long int *e  = factors->e;
-  long i;
-
-  /* Locate position for insert new or increment e.  */
-  for (i = nfactors - 1; i >= 0; i--)
-    {
-      if (mpz_cmp (p[i], prime) <= 0)
-        break;
-    }
-
-  if (i < 0 || mpz_cmp (p[i], prime) != 0)
-    {
-      p = realloc (p, (nfactors + 1) * sizeof p[0]);
-      e = realloc (e, (nfactors + 1) * sizeof e[0]);
-
-      mpz_init (p[nfactors]);
-      for (long j = nfactors - 1; j > i; j--)
-        {
-          mpz_set (p[j + 1], p[j]);
-          e[j + 1] = e[j];
-        }
-      mpz_set (p[i + 1], prime);
-      e[i + 1] = 1;
-
-      factors->p = p;
-      factors->e = e;
-      factors->nfactors = nfactors + 1;
-    }
-  else
-    {
-      e[i] += 1;
-    }
-}
-
-static void
-mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
-{
-  mpz_t pz;
-
-  mpz_init_set_ui (pz, prime);
-  mp_factor_insert (factors, pz);
-  mpz_clear (pz);
-}
-#endif /* HAVE_GMP */
-
-
-#define P(a,b,c,d) a,
-static const unsigned char primes_diff[] = {
-#include "primes.h"
-0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
-};
-#undef P
-
-#define PRIMES_PTAB_ENTRIES (sizeof(primes_diff) / sizeof(primes_diff[0]) - 8 
+ 1)
-
-#define P(a,b,c,d) b,
-static const unsigned char primes_diff8[] = {
-#include "primes.h"
-0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
-};
-#undef P
-
-struct primes_dtab
-{
-  uintmax_t binv, lim;
-};
-
-#define P(a,b,c,d) {c,d},
-static const struct primes_dtab primes_dtab[] = {
-#include "primes.h"
-{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
-};
-#undef P
-
-/* This flag is honoured just in the GMP code. */
-static int flag_verbose = 0;
-
-/* Prove primality or run probabilistic tests.  */
-static int flag_prove_primality = 1;
-
-/* Number of Miller-Rabin tests to run when not proving primality. */
-#define MR_REPS 25
-
-#define LIKELY(cond)    __builtin_expect ((cond) != 0, 1)
-#define UNLIKELY(cond)  __builtin_expect ((cond) != 0, 0)
-
-static void
-factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
-                      unsigned int off)
-{
-  for (unsigned int j = 0; j < off; j++)
-    p += primes_diff[i + j];
-  factor_insert (factors, p);
-}
-
-/* Trial division with odd primes uses the following trick.
-
-   Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
-   consider the case t < B (this is the second loop below).
-
-   From our tables we get
-
-     binv = p^{-1} (mod B)
-     lim = floor ( (B-1) / p ).
-
-   First assume that t is a multiple of p, t = q * p. Then 0 <= q <=
-   lim (and all quotients in this range occur for some t).
-
-   Then t = q * p is true also (mod B), and p is invertible we get
-
-     q = t * binv (mod B).
-
-   Next, assume that t is *not* divisible by p. Since multiplication
-   by binv (mod B) is a one-to-one mapping,
-
-     t * binv (mod B) > lim,
-
-   because all the smaller values are already taken.
-
-   This can be summed up by saying that the function
-
-     q(t) = binv * t (mod B)
-
-   is a permutation of the range 0 <= t < B, with the curious property
-   that it maps the multiples of p onto the range 0 <= q <= lim, in
-   order, and the non-multiples of p onto the range lim < q < B.
- */
-
-static uintmax_t
-factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
-                       struct factors *factors)
-{
-  if (t0 % 2 == 0)
-    {
-      unsigned int cnt;
-
-      if (t0 == 0)
-        {
-          count_trailing_zeros (cnt, t1);
-          t0 = t1 >> cnt;
-          t1 = 0;
-          cnt += W_TYPE_SIZE;
-        }
-      else
-        {
-          count_trailing_zeros (cnt, t0);
-          rsh2 (t1, t0, t1, t0, cnt);
-        }
-
-      factor_insert_multiplicity (factors, 2, cnt);
-    }
-
-  uintmax_t p = 3;
-  unsigned int i;
-  for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
-    {
-      for (;;)
-        {
-          uintmax_t q1, q0, hi, lo;
-
-          q0 = t0 * primes_dtab[i].binv;
-          umul_ppmm (hi, lo, q0, p);
-          if (hi > t1)
-            break;
-          hi = t1 - hi;
-          q1 = hi * primes_dtab[i].binv;
-          if (LIKELY (q1 > primes_dtab[i].lim))
-            break;
-          t1 = q1; t0 = q0;
-          factor_insert (factors, p);
-        }
-      p += primes_diff[i + 1];
-    }
-  if (t1p)
-    *t1p = t1;
-
-#define DIVBLOCK(I)                                                     \
-  do {                                                                  \
-    for (;;)                                                            \
-      {                                                                 \
-        q = t0 * pd[I].binv;                                            \
-        if (LIKELY (q > pd[I].lim))                                     \
-          break;                                                        \
-        t0 = q;                                                         \
-        factor_insert_refind (factors, p, i + 1, I);                    \
-      }                                                                 \
-  } while (0)
-
-  for (; i < PRIMES_PTAB_ENTRIES; i += 8)
-    {
-      uintmax_t q;
-      const struct primes_dtab *pd = &primes_dtab[i];
-      DIVBLOCK (0);
-      DIVBLOCK (1);
-      DIVBLOCK (2);
-      DIVBLOCK (3);
-      DIVBLOCK (4);
-      DIVBLOCK (5);
-      DIVBLOCK (6);
-      DIVBLOCK (7);
-
-      p += primes_diff8[i];
-      if (p * p > t0)
-        break;
-    }
-
-  return t0;
-}
-
-#if HAVE_GMP
-static void
-mp_factor_using_division (mpz_t t, struct mp_factors *factors)
-{
-  mpz_t q;
-  unsigned long int p;
-
-  if (flag_verbose > 0)
-    {
-      printf ("[trial division] ");
-    }
-
-  mpz_init (q);
-
-  p = mpz_scan1 (t, 0);
-  mpz_div_2exp (t, t, p);
-  while (p)
-    {
-      mp_factor_insert_ui (factors, 2);
-      --p;
-    }
-
-  p = 3;
-  for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
-    {
-      if (! mpz_divisible_ui_p (t, p))
-        {
-          p += primes_diff[i++];
-          if (mpz_cmp_ui (t, p * p) < 0)
-            break;
-        }
-      else
-        {
-          mpz_tdiv_q_ui (t, t, p);
-          mp_factor_insert_ui (factors, p);
-        }
-    }
-
-  mpz_clear (q);
-}
-#endif
-
-/* Entry i contains (2i+1)^(-1) mod 2^8.  */
-static const unsigned char  binvert_table[128] =
-{
-  0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
-  0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
-  0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
-  0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
-  0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
-  0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
-  0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
-  0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
-  0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
-  0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
-  0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
-  0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
-  0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
-  0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
-  0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
-  0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
-};
-
-/* Compute n^(-1) mod B, using a Newton iteration.  */
-#define binv(inv,n)                                                     \
-  do {                                                                  \
-    uintmax_t  __n = (n);                                               \
-    uintmax_t  __inv;                                                   \
-                                                                        \
-    __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
-    if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
-    if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
-    if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
-                                                                        \
-    if (W_TYPE_SIZE > 64)                                               \
-      {                                                                 \
-        int  __invbits = 64;                                            \
-        do {                                                            \
-          __inv = 2 * __inv - __inv * __inv * __n;                      \
-          __invbits *= 2;                                               \
-        } while (__invbits < W_TYPE_SIZE);                              \
-      }                                                                 \
-                                                                        \
-    (inv) = __inv;                                                      \
-  } while (0)
-
-/* q = u / d, assuming d|u.  */
-#define divexact_21(q1, q0, u1, u0, d)                                  \
-  do {                                                                  \
-    uintmax_t _di, _q0;                                                 \
-    binv (_di, (d));                                                    \
-    _q0 = (u0) * _di;                                                   \
-    if ((u1) >= (d))                                                    \
-      {                                                                 \
-        uintmax_t _p1, _p0;                                             \
-        umul_ppmm (_p1, _p0, _q0, d);                                   \
-        (q1) = ((u1) - _p1) * _di;                                      \
-        (q0) = _q0;                                                     \
-      }                                                                 \
-    else                                                                \
-      {                                                                 \
-        (q0) = _q0;                                                     \
-        (q1) = 0;                                                       \
-      }                                                                 \
-  } while(0)
-
-/* x B (mod n). */
-#define redcify(r_prim, r, n)                                           \
-  do {                                                                  \
-    uintmax_t _redcify_q;                                               \
-    udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
-  } while (0)
-
-/* x B^2 (mod n). Requires x > 0, n1 < B/2 */
-#define redcify2(r1, r0, x, n1, n0)                                     \
-  do {                                                                  \
-    uintmax_t _r1, _r0, _i;                                             \
-    if ((x) < (n1))                                                     \
-      {                                                                 \
-        _r1 = (x); _r0 = 0;                                             \
-        _i = W_TYPE_SIZE;                                               \
-      }                                                                 \
-    else                                                                \
-      {                                                                 \
-        _r1 = 0; _r0 = (x);                                             \
-        _i = 2*W_TYPE_SIZE;                                             \
-      }                                                                 \
-    while (_i-- > 0)                                                    \
-      {                                                                 \
-        lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
-        if (ge2 (_r1, _r0, (n1), (n0)))                                 \
-          sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
-      }                                                                 \
-    (r1) = _r1;                                                         \
-    (r0) = _r0;                                                         \
-  } while (0)
-
-/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
-   Both a and b must be in redc form, the result will be in redc form too. */
-static inline uintmax_t
-mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
-{
-  uintmax_t rh, rl, q, th, tl, xh;
-
-  umul_ppmm (rh, rl, a, b);
-  q = rl * mi;
-  umul_ppmm (th, tl, q, m);
-  xh = rh - th;
-  if (rh < th)
-    xh += m;
-
-  return xh;
-}
-
-/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
-   Both a and b must be in redc form, the result will be in redc form too.
-   For performance reasons, the most significant bit of m must be clear. */
-static uintmax_t
-mulredc2 (uintmax_t *r1p,
-          uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
-          uintmax_t m1, uintmax_t m0, uintmax_t mi)
-{
-  uintmax_t r1, r0, q, p1, p0, t1, t0, s1, s0;
-  mi = -mi;
-  assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
-  assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
-  assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
-
-  /* First compute a0 * <b1, b0> B^{-1}
-        +-----+
-        |a0 b0|
-     +--+--+--+
-     |a0 b1|
-     +--+--+--+
-        |q0 m0|
-     +--+--+--+
-     |q0 m1|
-    -+--+--+--+
-     |r1|r0| 0|
-     +--+--+--+
-  */
-  umul_ppmm (t1, t0, a0, b0);
-  umul_ppmm (r1, r0, a0, b1);
-  q = mi * t0;
-  umul_ppmm (p1, p0, q, m0);
-  umul_ppmm (s1, s0, q, m1);
-  r0 += (t0 != 0); /* Carry */
-  add_ssaaaa (r1, r0, r1, r0, 0, p1);
-  add_ssaaaa (r1, r0, r1, r0, 0, t1);
-  add_ssaaaa (r1, r0, r1, r0, s1, s0);
-
-  /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
-        +-----+
-        |a1 b0|
-        +--+--+
-        |r1|r0|
-     +--+--+--+
-     |a1 b1|
-     +--+--+--+
-        |q1 m0|
-     +--+--+--+
-     |q1 m1|
-    -+--+--+--+
-     |r1|r0| 0|
-     +--+--+--+
-  */
-  umul_ppmm (t1, t0, a1, b0);
-  umul_ppmm (s1, s0, a1, b1);
-  add_ssaaaa (t1, t0, t1, t0, 0, r0);
-  q = mi * t0;
-  add_ssaaaa (r1, r0, s1, s0, 0, r1);
-  umul_ppmm (p1, p0, q, m0);
-  umul_ppmm (s1, s0, q, m1);
-  r0 += (t0 != 0); /* Carry */
-  add_ssaaaa (r1, r0, r1, r0, 0, p1);
-  add_ssaaaa (r1, r0, r1, r0, 0, t1);
-  add_ssaaaa (r1, r0, r1, r0, s1, s0);
-
-  if (ge2 (r1, r0, m1, m0))
-    sub_ddmmss (r1, r0, r1, r0, m1, m0);
-
-  *r1p = r1;
-  return r0;
-}
-
-static uintmax_t _GL_ATTRIBUTE_CONST
-powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
-{
-  uintmax_t y = one;
-
-  if (e & 1)
-    y = b;
-
-  while (e != 0)
-    {
-      b = mulredc (b, b, n, ni);
-      e >>= 1;
-
-      if (e & 1)
-        y = mulredc (y, b, n, ni);
-    }
-
-  return y;
-}
-
-static uintmax_t
-powm2 (uintmax_t *r1m,
-       const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
-       uintmax_t ni, const uintmax_t *one)
-{
-  uintmax_t r1, r0, b1, b0, n1, n0;
-  unsigned int i;
-  uintmax_t e;
-
-  b0 = bp[0];
-  b1 = bp[1];
-  n0 = np[0];
-  n1 = np[1];
-
-  r0 = one[0];
-  r1 = one[1];
-
-  for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
-    {
-      if (e & 1)
-        {
-          r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
-          r1 = *r1m;
-        }
-      b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
-      b1 = *r1m;
-    }
-  for (e = ep[1]; e > 0; e >>= 1)
-    {
-      if (e & 1)
-        {
-          r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
-          r1 = *r1m;
-        }
-      b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
-      b1 = *r1m;
-    }
-  *r1m = r1;
-  return r0;
-}
-
-static bool _GL_ATTRIBUTE_CONST
-millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
-             unsigned int k, uintmax_t one)
-{
-  uintmax_t y = powm (b, q, n, ni, one);
-
-  uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
-
-  if (y == one || y == nm1)
-    return true;
-
-  for (unsigned int i = 1; i < k; i++)
-    {
-      y = mulredc (y, y, n, ni);
-
-      if (y == nm1)
-        return true;
-      if (y == one)
-        return false;
-    }
-  return false;
-}
-
-static bool
-millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
-              const uintmax_t *qp, unsigned int k, const uintmax_t *one)
-{
-  uintmax_t y1, y0, nm1_1, nm1_0, r1m;
-
-  y0 = powm2 (&r1m, bp, qp, np, ni, one);
-  y1 = r1m;
-
-  if (y0 == one[0] && y1 == one[1])
-    return true;
-
-  sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
-
-  if (y0 == nm1_0 && y1 == nm1_1)
-    return true;
-
-  for (unsigned int i = 1; i < k; i++)
-    {
-      y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
-      y1 = r1m;
-
-      if (y0 == nm1_0 && y1 == nm1_1)
-        return true;
-      if (y0 == one[0] && y1 == one[1])
-        return false;
-    }
-  return false;
-}
-
-#if HAVE_GMP
-static bool
-mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
-                mpz_srcptr q, unsigned long int k)
-{
-  mpz_powm (y, x, q, n);
-
-  if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
-    return true;
-
-  for (unsigned long int i = 1; i < k; i++)
-    {
-      mpz_powm_ui (y, y, 2, n);
-      if (mpz_cmp (y, nm1) == 0)
-        return true;
-      if (mpz_cmp_ui (y, 1) == 0)
-        return false;
-    }
-  return false;
-}
-#endif
-
-/* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
-   have been observed.  The average seem to be about 2.  */
-static bool
-prime_p (uintmax_t n)
-{
-  int k;
-  bool is_prime;
-  uintmax_t a_prim, one, ni;
-  struct factors factors;
-
-  if (n <= 1)
-    return false;
-
-  /* We have already casted out small primes. */
-  if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
-    return true;
-
-  /* Precomputation for Miller-Rabin.  */
-  uintmax_t q = n - 1;
-  for (k = 0; (q & 1) == 0; k++)
-    q >>= 1;
-
-  uintmax_t a = 2;
-  binv (ni, n);                 /* ni <- 1/n mod B */
-  redcify (one, 1, n);
-  addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
-
-  /* Perform a Miller-Rabin test, finds most composites quickly.  */
-  if (!millerrabin (n, ni, a_prim, q, k, one))
-    return false;
-
-  if (flag_prove_primality)
-    {
-      /* Factor n-1 for Lucas.  */
-      factor (0, n - 1, &factors);
-    }
-
-  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
-     number composite.  */
-  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
-    {
-      if (flag_prove_primality)
-        {
-          is_prime = true;
-          for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
-            {
-              is_prime = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != 
one;
-            }
-        }
-      else
-        {
-          /* After enough Miller-Rabin runs, be content. */
-          is_prime = (r == MR_REPS - 1);
-        }
-
-      if (is_prime)
-        return true;
-
-      a += primes_diff[r];      /* Establish new base.  */
-
-      /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
-         on most processors, since it avoids udiv_qrnnd.  If we go down the
-         udiv_qrnnd_preinv path, this code should be replaced.  */
-      {
-        uintmax_t dummy, s1, s0;
-        umul_ppmm (s1, s0, one, a);
-        if (LIKELY (s1 == 0))
-          a_prim = s0 % n;
-        else
-          udiv_qrnnd (dummy, a_prim, s1, s0, n);
-      }
-
-      if (!millerrabin (n, ni, a_prim, q, k, one))
-        return false;
-    }
-
-  fprintf (stderr, "Lucas prime test failure.  This should not happen\n");
-  abort ();
-}
-
-static bool
-prime2_p (uintmax_t n1, uintmax_t n0)
-{
-  uintmax_t q[2], nm1[2];
-  uintmax_t a_prim[2];
-  uintmax_t one[2];
-  uintmax_t na[2];
-  uintmax_t ni;
-  unsigned int k;
-  struct factors factors;
-
-  if (n1 == 0)
-    return prime_p (n0);
-
-  nm1[1] = n1 - (n0 == 0);
-  nm1[0] = n0 - 1;
-  if (nm1[0] == 0)
-    {
-      count_trailing_zeros (k, nm1[1]);
-
-      q[0] = nm1[1] >> k;
-      q[1] = 0;
-      k += W_TYPE_SIZE;
-    }
-  else
-    {
-      count_trailing_zeros (k, nm1[0]);
-      rsh2 (q[1], q[0], nm1[1], nm1[0], k);
-    }
-
-  uintmax_t a = 2;
-  binv (ni, n0);
-  redcify2 (one[1], one[0], 1, n1, n0);
-  addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
-
-  /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
-  na[0] = n0;
-  na[1] = n1;
-
-  if (!millerrabin2 (na, ni, a_prim, q, k, one))
-    return false;
-
-  if (flag_prove_primality)
-    {
-      /* Factor n-1 for Lucas.  */
-      factor (nm1[1], nm1[0], &factors);
-    }
-
-  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
-     number composite.  */
-  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
-    {
-      bool is_prime;
-      uintmax_t e[2], y[2];
-
-      if (flag_prove_primality)
-        {
-          is_prime = true;
-          if (factors.plarge[1])
-            {
-              uintmax_t pi;
-              binv (pi, factors.plarge[0]);
-              e[0] = pi * nm1[0];
-              e[1] = 0;
-              y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
-              is_prime = (y[0] != one[0] || y[1] != one[1]);
-            }
-          for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
-            {
-              /* FIXME: We always have the factor 2. Do we really need to 
handle it
-                 here? We have done the same powering as part of millerrabin. 
*/
-              if (factors.p[i] == 2)
-                rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
-              else
-                divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
-              y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
-              is_prime = (y[0] != one[0] || y[1] != one[1]);
-            }
-        }
-      else
-        {
-          /* After enough Miller-Rabin runs, be content. */
-          is_prime = (r == MR_REPS - 1);
-        }
-
-      if (is_prime)
-        return true;
-
-      a += primes_diff[r];      /* Establish new base.  */
-      redcify2 (a_prim[1], a_prim[0], a, n1, n0);
-
-      if (!millerrabin2 (na, ni, a_prim, q, k, one))
-        return false;
-    }
-
-  fprintf (stderr, "Lucas prime test failure.  This should not happen\n");
-  abort ();
-}
-
-#if HAVE_GMP
-static bool
-mp_prime_p (mpz_t n)
-{
-  bool is_prime;
-  mpz_t q, a, nm1, tmp;
-  struct mp_factors factors;
-
-  if (mpz_cmp_ui (n, 1) <= 0)
-    return false;
-
-  /* We have already casted out small primes. */
-  if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
-    return true;
-
-  mpz_inits (q, a, nm1, tmp, NULL);
-
-  /* Precomputation for Miller-Rabin.  */
-  mpz_sub_ui (nm1, n, 1);
-
-  /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
-  unsigned long int k = mpz_scan1 (nm1, 0);
-  mpz_tdiv_q_2exp (q, nm1, k);
-
-  mpz_set_ui (a, 2);
-
-  /* Perform a Miller-Rabin test, finds most composites quickly.  */
-  if (!mp_millerrabin (n, nm1, a, tmp, q, k))
-    {
-      is_prime = false;
-      goto ret2;
-    }
-
-  if (flag_prove_primality)
-    {
-      /* Factor n-1 for Lucas.  */
-      mpz_set (tmp, nm1);
-      mp_factor (tmp, &factors);
-    }
-
-  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
-     number composite.  */
-  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
-    {
-      if (flag_prove_primality)
-        {
-          is_prime = true;
-          for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
-            {
-              mpz_divexact (tmp, nm1, factors.p[i]);
-              mpz_powm (tmp, a, tmp, n);
-              is_prime = mpz_cmp_ui (tmp, 1) != 0;
-            }
-        }
-      else
-        {
-          /* After enough Miller-Rabin runs, be content. */
-          is_prime = (r == MR_REPS - 1);
-        }
-
-      if (is_prime)
-        goto ret1;
-
-      mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
-
-      if (!mp_millerrabin (n, nm1, a, tmp, q, k))
-        {
-          is_prime = false;
-          goto ret1;
-        }
-    }
-
-  fprintf (stderr, "Lucas prime test failure.  This should not happen\n");
-  abort ();
-
- ret1:
-  if (flag_prove_primality)
-    mp_factor_clear (&factors);
- ret2:
-  mpz_clears (q, a, nm1, tmp, NULL);
-
-  return is_prime;
-}
-#endif
-
-static void
-factor_using_pollard_rho (uintmax_t n, unsigned long int a,
-                          struct factors *factors)
-{
-  uintmax_t x, z, y, P, t, ni, g;
-
-  unsigned long int k = 1;
-  unsigned long int l = 1;
-
-  redcify (P, 1, n);
-  addmod (x, P, P, n);          /* i.e., redcify(2) */
-  y = z = x;
-
-  while (n != 1)
-    {
-      assert (a < n);
-
-      binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
-
-      for (;;)
-        {
-          do
-            {
-              x = mulredc (x, x, n, ni);
-              addmod (x, x, a, n);
-
-              submod (t, z, x, n);
-              P = mulredc (P, t, n, ni);
-
-              if (k % 32 == 1)
-                {
-                  if (gcd_odd (P, n) != 1)
-                    goto factor_found;
-                  y = x;
-                }
-            }
-          while (--k != 0);
-
-          z = x;
-          k = l;
-          l = 2 * l;
-          for (unsigned long int i = 0; i < k; i++)
-            {
-              x = mulredc (x, x, n, ni);
-              addmod (x, x, a, n);
-            }
-          y = x;
-        }
-
-    factor_found:
-      do
-        {
-          y = mulredc (y, y, n, ni);
-          addmod (y, y, a, n);
-
-          submod (t, z, y, n);
-          g = gcd_odd (t, n);
-        }
-      while (g == 1);
-
-      n = n / g;
-
-      if (!prime_p (g))
-        factor_using_pollard_rho (g, a + 1, factors);
-      else
-        factor_insert (factors, g);
-
-      if (prime_p (n))
-        {
-          factor_insert (factors, n);
-          break;
-        }
-
-      x = x % n;
-      z = z % n;
-      y = y % n;
-    }
-}
-
-static void
-factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
-                           struct factors *factors)
-{
-  uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
-
-  unsigned long int k = 1;
-  unsigned long int l = 1;
-
-  redcify2 (P1, P0, 1, n1, n0);
-  addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
-  y1 = z1 = x1;
-  y0 = z0 = x0;
-
-  while (n1 != 0 || n0 != 1)
-    {
-      binv (ni, n0);
-
-      for (;;)
-        {
-          do
-            {
-              x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
-              x1 = r1m;
-              addmod2 (x1, x0, x1, x0, 0, a, n1, n0);
-
-              submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
-              P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
-              P1 = r1m;
-
-              if (k % 32 == 1)
-                {
-                  g0 = gcd2_odd (&g1, P1, P0, n1, n0);
-                  if (g1 != 0 || g0 != 1)
-                    goto factor_found;
-                  y1 = x1; y0 = x0;
-                }
-            }
-          while (--k != 0);
-
-          z1 = x1; z0 = x0;
-          k = l;
-          l = 2 * l;
-          for (unsigned long int i = 0; i < k; i++)
-            {
-              x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
-              x1 = r1m;
-              addmod2 (x1, x0, x1, x0, 0, a, n1, n0);
-            }
-          y1 = x1; y0 = x0;
-        }
-
-    factor_found:
-      do
-        {
-          y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
-          y1 = r1m;
-          addmod2 (y1, y0, y1, y0, 0, a, n1, n0);
-
-          submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
-          g0 = gcd2_odd (&g1, t1, t0, n1, n0);
-        }
-      while (g1 == 0 && g0 == 1);
-
-      if (g1 == 0)
-        {
-          /* The found factor is one word. */
-          divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
-
-          if (!prime_p (g0))
-            factor_using_pollard_rho (g0, a + 1, factors);
-          else
-            factor_insert (factors, g0);
-        }
-      else
-        {
-          /* The found factor is two words.  This is highly unlikely, thus hard
-             to trigger.  Please be careful before you change this code!  */
-          uintmax_t ginv;
-
-          binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
-          n0 = ginv * n0;       /* fit one word, we can compute the quotient */
-          n1 = 0;               /* modulo B, ignoring the high divisor word. */
-
-          if (!prime2_p (g1, g0))
-            factor_using_pollard_rho2 (g1, g0, a + 1, factors);
-          else
-            factor_insert_large (factors, g1, g0);
-        }
-
-      if (n1 == 0)
-        {
-          if (prime_p (n0))
-            {
-              factor_insert (factors, n0);
-              break;
-            }
-
-          factor_using_pollard_rho (n0, a, factors);
-          return;
-        }
-
-      if (prime2_p (n1, n0))
-        {
-          factor_insert_large (factors, n1, n0);
-          break;
-        }
-
-      x0 = mod2 (&x1, x1, x0, n1, n0);
-      z0 = mod2 (&z1, z1, z0, n1, n0);
-      y0 = mod2 (&y1, y1, y0, n1, n0);
-    }
-}
-
-#if HAVE_GMP
-static void
-mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
-                             struct mp_factors *factors)
-{
-  mpz_t x, z, y, P;
-  mpz_t t, t2;
-  unsigned long long int k, l;
-
-  if (flag_verbose > 0)
-    {
-      printf ("[pollard-rho (%lu)] ", a);
-    }
-
-  mpz_inits (t, t2, NULL);
-  mpz_init_set_si (y, 2);
-  mpz_init_set_si (x, 2);
-  mpz_init_set_si (z, 2);
-  mpz_init_set_ui (P, 1);
-  k = 1;
-  l = 1;
-
-  while (mpz_cmp_ui (n, 1) != 0)
-    {
-      for (;;)
-        {
-          do
-            {
-              mpz_mul (t, x, x);
-              mpz_mod (x, t, n);
-              mpz_add_ui (x, x, a);
-
-              mpz_sub (t, z, x);
-              mpz_mul (t2, P, t);
-              mpz_mod (P, t2, n);
-
-              if (k % 32 == 1)
-                {
-                  mpz_gcd (t, P, n);
-                  if (mpz_cmp_ui (t, 1) != 0)
-                    goto factor_found;
-                  mpz_set (y, x);
-                }
-            }
-          while (--k != 0);
-
-          mpz_set (z, x);
-          k = l;
-          l = 2 * l;
-          for (unsigned long long int i = 0; i < k; i++)
-            {
-              mpz_mul (t, x, x);
-              mpz_mod (x, t, n);
-              mpz_add_ui (x, x, a);
-            }
-          mpz_set (y, x);
-        }
-
-    factor_found:
-      do
-        {
-          mpz_mul (t, y, y);
-          mpz_mod (y, t, n);
-          mpz_add_ui (y, y, a);
-
-          mpz_sub (t, z, y);
-          mpz_gcd (t, t, n);
-        }
-      while (mpz_cmp_ui (t, 1) == 0);
-
-      mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
-
-      if (!mp_prime_p (t))
-        {
-          if (flag_verbose > 0)
-            {
-              printf ("[composite factor--restarting pollard-rho] ");
-            }
-          mp_factor_using_pollard_rho (t, a + 1, factors);
-        }
-      else
-        {
-          mp_factor_insert (factors, t);
-        }
-
-      if (mp_prime_p (n))
-        {
-          mp_factor_insert (factors, n);
-          break;
-        }
-
-      mpz_mod (x, x, n);
-      mpz_mod (z, z, n);
-      mpz_mod (y, y, n);
-    }
-
-  mpz_clears (P, t2, t, z, x, y, NULL);
-}
-#endif
-
-/* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
-   algorithm is replaced, consider also returning the remainder. */
-static uintmax_t _GL_ATTRIBUTE_CONST
-isqrt (uintmax_t n)
-{
-  uintmax_t x;
-  unsigned c;
-  if (n == 0)
-    return 0;
-
-  count_leading_zeros (c, n);
-
-  /* Make x > sqrt(n). This will be invariant through the loop. */
-  x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
-
-  for (;;)
-    {
-      uintmax_t y = (x + n/x) / 2;
-      if (y >= x)
-        return x;
-
-      x = y;
-    }
-}
-
-static uintmax_t _GL_ATTRIBUTE_CONST
-isqrt2 (uintmax_t nh, uintmax_t nl)
-{
-  unsigned int shift;
-  uintmax_t x;
-
-  /* Ensures the remainder fits in an uintmax_t. */
-  assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
-
-  if (nh == 0)
-    return isqrt (nl);
-
-  count_leading_zeros (shift, nh);
-  shift &= ~1;
-
-  /* Make x > sqrt(n) */
-  x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
-  x <<= (W_TYPE_SIZE - shift) / 2;
-
-  /* Do we need more than one iteration? */
-  for (;;)
-    {
-      uintmax_t q, r, y;
-      udiv_qrnnd (q, r, nh, nl, x);
-      y = (x + q) / 2;
-
-      if (y >= x)
-        {
-          uintmax_t hi, lo;
-          umul_ppmm (hi, lo, x + 1, x + 1);
-          assert (gt2 (hi, lo, nh, nl));
-
-          umul_ppmm (hi, lo, x, x);
-          assert (ge2 (nh, nl, hi, lo));
-          sub_ddmmss (hi, lo, nh, nl, hi, lo);
-          assert (hi == 0);
-
-          return x;
-        }
-
-      x = y;
-    }
-}
-
-/* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
-#define MAGIC64 ((uint64_t) 0x0202021202030213ULL)
-#define MAGIC63 ((uint64_t) 0x0402483012450293ULL)
-#define MAGIC65 ((uint64_t) 0x218a019866014613ULL)
-#define MAGIC11 0x23b
-
-/* Returns the square root if the input is a square, otherwise 0. */
-static uintmax_t _GL_ATTRIBUTE_CONST
-is_square (uintmax_t x)
-{
-  /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
-     computing the square root. */
-  if (((MAGIC64 >> (x & 63)) & 1)
-      && ((MAGIC63 >> (x % 63)) & 1)
-      /* Both 0 and 64 are squares mod (65) */
-      && ((MAGIC65 >> ((x % 65) & 63)) & 1)
-      && ((MAGIC11 >> (x % 11) & 1)))
-    {
-      uintmax_t r = isqrt (x);
-      if (r*r == x)
-        return r;
-    }
-  return 0;
-}
-
-/* invtab[i] = floor(0x10000 / (0x100 + i) */
-static const unsigned short invtab[0x81] =
-  {
-    0x200,
-    0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
-    0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
-    0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
-    0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
-    0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
-    0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
-    0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
-    0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
-    0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
-    0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
-    0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
-    0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
-    0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
-    0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
-    0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
-    0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
-  };
-
-/* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
-   that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
-#define div_smallq(q, r, u, d)                                          \
-  do {                                                                  \
-    if ((u) / 0x40 < (d))                                               \
-      {                                                                 \
-        int _cnt;                                                       \
-        uintmax_t _dinv, _mask, _q, _r;                                 \
-        count_leading_zeros (_cnt, (d));                                \
-        _r = (u);                                                       \
-        if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
-          {                                                             \
-            _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
-            _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
-          }                                                             \
-        else                                                            \
-          {                                                             \
-            _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
-            _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
-          }                                                             \
-        _r -= _q*(d);                                                   \
-                                                                        \
-        _mask = -(uintmax_t) (_r >= (d));                               \
-        (r) = _r - (_mask & (d));                                       \
-        (q) = _q - _mask;                                               \
-        assert ( (q) * (d) + (r) == u);                                 \
-      }                                                                 \
-    else                                                                \
-      {                                                                 \
-        uintmax_t _q = (u) / (d);                                       \
-        (r) = (u) - _q * (d);                                           \
-        (q) = _q;                                                       \
-      }                                                                 \
-  } while (0)
-
-/* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
-   = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
-   with 3025 = 55^2.
-
-   Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
-   representing G_0 = (-55, 2*4652, 8653).
-
-   In the notation of the paper:
-
-   S_{-1} = 55, S_0 = 8653, R_0 = 4652
-
-   Put
-
-     t_0 = floor([q_0 + R_0] / S0) = 1
-     R_1 = t_0 * S_0 - R_0 = 4001
-     S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
-*/
-
-/* Multipliers, in order of efficiency:
-   0.7268  3*5*7*11 = 1155 = 3 (mod 4)
-   0.7317  3*5*7    =  105 = 1
-   0.7820  3*5*11   =  165 = 1
-   0.7872  3*5      =   15 = 3
-   0.8101  3*7*11   =  231 = 3
-   0.8155  3*7      =   21 = 1
-   0.8284  5*7*11   =  385 = 1
-   0.8339  5*7      =   35 = 3
-   0.8716  3*11     =   33 = 1
-   0.8774  3        =    3 = 3
-   0.8913  5*11     =   55 = 3
-   0.8972  5        =    5 = 1
-   0.9233  7*11     =   77 = 1
-   0.9295  7        =    7 = 3
-   0.9934  11       =   11 = 3
-*/
-#define QUEUE_SIZE 50
-
-#if STAT_SQUFOF
-# define Q_FREQ_SIZE 50
-/* Element 0 keeps the total */
-static unsigned int q_freq[Q_FREQ_SIZE + 1];
-# define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-
-/* Return true on success.  Expected to fail only for numbers
-   >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
-static bool
-factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
-{
-  /* Uses algorithm and notation from
-
-     SQUARE FORM FACTORIZATION
-     JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
-
-     http://homes.cerias.purdue.edu/~ssw/squfof.pdf
-   */
-
-  static const unsigned int multipliers_1[] =
-    { /* = 1 (mod 4) */
-      105, 165, 21, 385, 33, 5, 77, 1, 0
-    };
-  static const unsigned int multipliers_3[] =
-    { /* = 3 (mod 4) */
-      1155, 15, 231, 35, 3, 55, 7, 11, 0
-    };
-
-  const unsigned int *m;
-
-  struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
-
-  if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
-    return false;
-
-  uintmax_t sqrt_n = isqrt2 (n1, n0);
-
-  if (n0 == sqrt_n * sqrt_n)
-    {
-      uintmax_t p1, p0;
-
-      umul_ppmm (p1, p0, sqrt_n, sqrt_n);
-      assert (p0 == n0);
-
-      if (n1 == p1)
-        {
-          if (prime_p (sqrt_n))
-            factor_insert_multiplicity (factors, sqrt_n, 2);
-          else
-            {
-              struct factors f;
-
-              f.nfactors = 0;
-              if (!factor_using_squfof (0, sqrt_n, &f))
-                {
-                  /* Try pollard rho instead */
-                  factor_using_pollard_rho (sqrt_n, 1, &f);
-                }
-              /* Duplicate the new factors */
-              for (unsigned int i = 0; i < f.nfactors; i++)
-                factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
-            }
-          return true;
-        }
-    }
-
-  /* Select multipliers so we always get n * mu = 3 (mod 4) */
-  for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
-       *m; m++)
-    {
-      uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
-      unsigned int i;
-      unsigned int mu = *m;
-      unsigned int qpos = 0;
-
-      assert (mu * n0 % 4 == 3);
-
-      /* In the notation of the paper, with mu * n == 3 (mod 4), we
-         get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
-         I understand it, the necessary bound is 4 \mu^3 < n, or 32
-         mu^3 < n.
-
-         However, this seems insufficient: With n = 37243139 and mu =
-         105, we get a trivial factor, from the square 38809 = 197^2,
-         without any corresponding Q earlier in the iteration.
-
-         Requiring 64 mu^3 < n seems sufficient. */
-      if (n1 == 0)
-        {
-          if ((uintmax_t) mu*mu*mu >= n0 / 64)
-            continue;
-        }
-      else
-        {
-          if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
-            continue;
-        }
-      umul_ppmm (Dh, Dl, n0, mu);
-      Dh += n1 * mu;
-
-      assert (Dl % 4 != 1);
-      assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
-
-      S = isqrt2 (Dh, Dl);
-
-      Q1 = 1;
-      P = S;
-
-      /* Square root remainder fits in one word, so ignore high part. */
-      Q = Dl - P*P;
-      /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
-      L = isqrt (2*S);
-      B = 2*L;
-      L1 = mu * 2 * L;
-
-      /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
-         4 D. */
-
-      for (i = 0; i <= B; i++)
-        {
-          uintmax_t q, P1, t, rem;
-
-          div_smallq (q, rem, S+P, Q);
-          P1 = S - rem; /* P1 = q*Q - P */
-
-#if STAT_SQUFOF
-          assert (q > 0);
-          q_freq[0]++;
-          q_freq[MIN(q, Q_FREQ_SIZE)]++;
-#endif
-
-          if (Q <= L1)
-            {
-              uintmax_t g = Q;
-
-              if ( (Q & 1) == 0)
-                g /= 2;
-
-              g /= gcd_odd (g, mu);
-
-              if (g <= L)
-                {
-                  if (qpos >= QUEUE_SIZE)
-                    {
-                      fprintf (stderr, "squfof queue overflow.\n");
-                      exit (EXIT_FAILURE);
-                    }
-                  queue[qpos].Q = g;
-                  queue[qpos].P = P % g;
-                  qpos++;
-                }
-            }
-
-          /* I think the difference can be either sign, but mod
-             2^W_TYPE_SIZE arithmetic should be fine. */
-          t = Q1 + q * (P - P1);
-          Q1 = Q;
-          Q = t;
-          P = P1;
-
-          if ( (i & 1) == 0)
-            {
-              uintmax_t r = is_square (Q);
-              if (r)
-                {
-                  unsigned int j;
-
-                  for (j = 0; j < qpos; j++)
-                    {
-                      if (queue[j].Q == r)
-                        {
-                          if (r == 1)
-                            /* Traversed entire cycle. */
-                            goto next_multiplier;
-
-                          /* Need the absolute value for divisibility test. */
-                          if (P >= queue[j].P)
-                            t = P - queue[j].P;
-                          else
-                            t = queue[j].P - P;
-                          if (t % r == 0)
-                            {
-                              /* Delete entries up to and including entry
-                                 j, which matched. */
-                              memmove (queue, queue + j + 1,
-                                       (qpos - j - 1) * sizeof (queue[0]));
-                              qpos -= (j + 1);
-                            }
-                          goto next_i;
-                        }
-                    }
-
-                  /* We have found a square form, which should give a
-                     factor. */
-                  Q1 = r;
-                  assert (S >= P); /* What signs are possible? */
-                  P += r * ((S - P) / r);
-
-                  /* Note: Paper says (N - P*P) / Q1, that seems incorrect
-                     for the case D = 2N. */
-                  /* Compute Q = (D - P*P) / Q1, but we need double
-                     precision. */
-                  uintmax_t hi, lo;
-                  umul_ppmm (hi, lo, P, P);
-                  sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
-                  udiv_qrnnd (Q, rem, hi, lo, Q1);
-                  assert (rem == 0);
-
-                  for (;;)
-                    {
-                      /* Note: There appears to by a typo in the paper,
-                         Step 4a in the algorithm description says q <--
-                         floor([S+P]/\hat Q), but looking at the equations
-                         in Sec. 3.1, it should be q <-- floor([S+P] / Q).
-                         (In this code, \hat Q is Q1). */
-                      div_smallq (q, rem, S+P, Q);
-                      P1 = S - rem;     /* P1 = q*Q - P */
-
-#if STAT_SQUFOF
-                      q_freq[0]++;
-                      q_freq[MIN (q, Q_FREQ_SIZE)]++;
-#endif
-                      if (P == P1)
-                        break;
-                      t = Q1 + q * (P - P1);
-                      Q1 = Q;
-                      Q = t;
-                      P = P1;
-                    }
-
-                  if ( (Q & 1) == 0)
-                    Q /= 2;
-                  Q /= gcd_odd (Q, mu);
-
-                  assert (Q > 1 && (n1 || Q < n0));
-
-                  if (prime_p (Q))
-                    factor_insert (factors, Q);
-                  else if (!factor_using_squfof (0, Q, factors))
-                    factor_using_pollard_rho (Q, 2, factors);
-
-                  divexact_21 (n1, n0, n1, n0, Q);
-
-                  if (prime2_p (n1, n0))
-                    factor_insert_large (factors, n1, n0);
-                  else
-                    {
-                      if (!factor_using_squfof (n1, n0, factors))
-                        {
-                          if (n1 == 0)
-                            factor_using_pollard_rho (n0, 1, factors);
-                          else
-                            factor_using_pollard_rho2 (n1, n0, 1, factors);
-                        }
-                    }
-
-                  return true;
-                }
-            }
-        next_i:
-          ;
-        }
-    next_multiplier:
-      ;
-    }
-  return false;
-}
-
-static void
-factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
-{
-  factors->nfactors = 0;
-  factors->plarge[1] = 0;
-
-  if (t1 == 0 && t0 < 2)
-    return;
-
-  t0 = factor_using_division (&t1, t1, t0, factors);
-
-  if (t1 == 0 && t0 < 2)
-    return;
-
-  if (prime2_p (t1, t0))
-    factor_insert_large (factors, t1, t0);
-  else
-    {
-      if (alg == ALG_SQUFOF)
-        if (factor_using_squfof (t1, t0, factors))
-          return;
-
-      if (t1 == 0)
-        factor_using_pollard_rho (t0, 1, factors);
-      else
-        factor_using_pollard_rho2 (t1, t0, 1, factors);
-    }
-}
-
-#if HAVE_GMP
-static void
-mp_factor (mpz_t t, struct mp_factors *factors)
-{
-  mp_factor_init (factors);
-
-  if (mpz_sgn (t) != 0)
-    {
-      mp_factor_using_division (t, factors);
-
-      if (mpz_cmp_ui (t, 1) != 0)
-        {
-          if (flag_verbose > 0)
-            {
-              printf ("[is number prime?] ");
-            }
-          if (mp_prime_p (t))
-            mp_factor_insert (factors, t);
-          else
-            mp_factor_using_pollard_rho (t, 1, factors);
-        }
-    }
-}
-#endif
-
-static int
-strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
-{
-  int errcode;
-  unsigned int lo_carry;
-  uintmax_t hi, lo;
-
-  errcode = -1;
-
-  hi = lo = 0;
-  for (;;)
-    {
-      unsigned int c = *s++;
-      if (c == 0)
-        break;
-
-      if (UNLIKELY (c < '0' || c > '9'))
-        {
-          errcode = -1;
-          break;
-        }
-      c -= '0';
-
-      errcode = 0;              /* we've seen at least one valid digit */
-
-      if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
-        {
-          errcode = -1; /* overflow */
-          break;
-        }
-      hi = 10 * hi;
-
-      lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
-      lo_carry += 10 * lo < 2 * lo;
-
-      lo = 10 * lo;
-      lo += c;
-
-      lo_carry += lo < c;
-      hi += lo_carry;
-      if (UNLIKELY (hi < lo_carry))
-        {
-          errcode = -1; /* overflow */
-          break;
-        }
-    }
-
-  *hip = hi;
-  *lop = lo;
-
-  return errcode;
-}
-
-static void
-print_uintmaxes (uintmax_t t1, uintmax_t t0)
-{
-  uintmax_t q, r;
-
-  if (t1 == 0)
-    printf ("%ju", t0);
-  else
-    {
-      /* Use very plain code here since it seems hard to write fast code
-         without assuming a specific word size.  */
-      q = t1 / 1000000000;
-      r = t1 % 1000000000;
-      udiv_qrnnd (t0, r, r, t0, 1000000000);
-      print_uintmaxes (q, t0);
-      printf ("%09u", (int) r);
-    }
-}
-
-static void
-factor_one (const char *input)
-{
-  uintmax_t t1, t0;
-  int errcode;
-
-  /* Try converting the number to one or two words.  If it fails, use GMP or
-     print an error message.  The 2nd condition checks that the most
-     significant bit of the two-word number is clear, in a typesize neutral
-     way.  */
-  errcode = strto2uintmax (&t1, &t0, input);
-  if (errcode == 0 && ((t1 << 1) >> 1) == t1)
-    {
-      struct factors factors;
-
-      print_uintmaxes (t1, t0);
-      printf (":");
-
-      factor (t1, t0, &factors);
-
-      for (unsigned int j = 0; j < factors.nfactors; j++)
-        for (unsigned int k = 0; k < factors.e[j]; k++)
-          printf (" %ju", factors.p[j]);
-
-      if (factors.plarge[1])
-        {
-          printf (" ");
-          print_uintmaxes (factors.plarge[1], factors.plarge[0]);
-        }
-      puts ("");
-    }
-  else
-    {
-#if HAVE_GMP
-      mpz_t t;
-      struct mp_factors factors;
-
-      mpz_init_set_str (t, input, 10);
-
-      gmp_printf ("%Zd:", t);
-      mp_factor (t, &factors);
-
-      for (unsigned int j = 0; j < factors.nfactors; j++)
-        for (unsigned int k = 0; k < factors.e[j]; k++)
-          gmp_printf (" %Zd", factors.p[j]);
-
-      mp_factor_clear (&factors);
-      mpz_clear (t);
-      puts ("");
-#else
-      fprintf (stderr, "error: number %s not parsable or too large\n", input);
-      exit (EXIT_FAILURE);
-#endif
-    }
-}
-
-struct inbuf
-{
-  char *buf;
-  size_t alloc;
-};
-
-/* Read white-space delimited items. Return 1 on success, 0 on EOF.
-   Exit on I/O errors. */
-int
-read_item (struct inbuf *bufstruct)
-{
-  int c;
-  size_t i;
-  char *buf = bufstruct->buf;
-
-  do
-    c = getchar_unlocked ();
-  while (isspace (c));
-
-  for (i = 0; !isspace(c); i++)
-    {
-      if (c < 0)
-        {
-          if (ferror (stdin))
-            {
-              fprintf (stderr, "read error on stdin: %s\n", strerror(errno));
-              exit (EXIT_FAILURE);
-            }
-          if (i == 0)
-            return 0;
-          else
-            break;
-        }
-
-      if (UNLIKELY (bufstruct->alloc <= i + 1)) /* +1 byte for terminating NUL 
*/
-        {
-          bufstruct->alloc = bufstruct->alloc * 5 / 4 + 1;
-          bufstruct->buf = realloc (bufstruct->buf, bufstruct->alloc);
-          buf = bufstruct->buf;
-        }
-
-      buf[i] = c;
-      c = getchar_unlocked ();
-    }
-
-  buf[i] = '\0';
-  return 1;
-}
-
-int
-main (int argc, char *argv[])
-{
-  int c;
-
-  alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
-
-  while ( (c = getopt(argc, argv, "svw")) != -1)
-    switch (c)
-      {
-      case 's':
-        alg = ALG_SQUFOF;
-        break;
-      case 'v':
-        flag_verbose = 1;
-        break;
-      case 'w':
-        flag_prove_primality = 0;
-        break;
-      case '?':
-        printf ("Usage: %s [-s] number ...\n", argv[0]);
-        return EXIT_FAILURE;
-      }
-#if STAT_SQUFOF
-  if (alg == ALG_SQUFOF)
-    memset (q_freq, 0, sizeof (q_freq));
-#endif
-
-  if (optind < argc)
-    for (int i = optind; i < argc; i++)
-      factor_one (argv[i]);
-  else
-    {
-      struct inbuf bufstruct;
-      bufstruct.alloc = 50;     /* enough unless HAVE_GMP */
-      bufstruct.buf = malloc (bufstruct.alloc);
-      while (read_item (&bufstruct))
-        factor_one (bufstruct.buf);
-    }
-
-#if STAT_SQUFOF
-  if (alg == ALG_SQUFOF && q_freq[0] > 0)
-    {
-      double acc_f;
-      printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
-      for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
-        {
-          double f = (double) q_freq[i] / q_freq[0];
-          acc_f += f;
-          printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
-                  100.0 * f, 100.0 * acc_f);
-        }
-    }
-#endif
-
-  exit (EXIT_SUCCESS);
-}
diff --git a/src/factor.c b/src/factor.c
index e63e0e0..84d9721 100644
--- a/src/factor.c
+++ b/src/factor.c
@@ -14,17 +14,79 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

-/* Written by Paul Rubin <address@hidden>.
+/* Originally written by Paul Rubin <address@hidden>.
    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
    Arbitrary-precision code adapted by James Youngman from Torbjörn
    Granlund's factorize.c, from GNU MP version 4.2.2.
+   In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
+   Contains code from GNU MP.  */
+
+/* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
+   or, with GMP, numbers of any size.
+
+  Code organisation:
+
+    There are several variants of many functions, for handling one word, two
+    words, and GMP's mpz_t type.  If the one-word variant is called foo, the
+    two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
+    some cases, the plain function variants will handle both one-word and
+    two-word numbers, evidenced by function arguments.
+
+    The factoring code for two words will fall into the code for one word when
+    progress allows that.
+
+    Using GMP is optional.  Define HAVE_GMP to make this code include GMP
+    factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
+    (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
+    factoring code even if numbers get small enough for using the two-word
+    code.
+
+  Algorithm:
+
+    (1) Perform trial division using a small primes table, but without hardware
+        division since the primes table store inverses modulo the word base.
+        (The GMP variant of this code doesn't make use of the precomputed
+        inverses, but instead relies on GMP for fast divisibility testing.)
+    (2) Check the nature of any non-factored part using Miller-Rabin for
+        detecting composites, and Lucas for detecting primes.
+    (3) Factor any remaining composite part using the Pollard-Brent rho
+        algorithm or the SQUFOF algorithm, checking status of found factors
+        again using Miller-Rabin and Lucas.
+
+    We prefer using Hensel norm in the divisions, not the more familiar
+    Euclidian norm, since the former leads to much faster code.  In the
+    Pollard-Brent rho code and the prime testing code, we use Montgomery's
+    trick of multiplying all n-residues by the word base, allowing cheap Hensel
+    reductions mod n.
+
+  Improvements:
+
+    * Use modular inverses also for exact division in the Lucas code, and
+      elsewhere.  A problem is to locate the inverses not from an index, but
+      from a prime.  We might instead compute the inverse on-the-fly.
+
+    * Tune trial division table size (not forgetting that this is a standalone
+      program where the table will be read from disk for each invocation).
+
+    * Implement less naive powm, using k-ary exponentiation for k = 3 or
+      perhaps k = 4.
+
+    * Try to speed trial division code for single uintmax_t numbers, i.e., the
+      code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
+      IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
+      using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
+      respectively cycles ought to be possible.
+
+    * The redcify function could be vastly improved by using (plain Euclidian)
+      pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
+      GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
+      similar methoods.  These functions currently dominate run time when using
+      the -w option.
 */

 #include <config.h>
 #include <getopt.h>
-#include <stdarg.h>
 #include <stdio.h>
-#include <sys/types.h>
 #if HAVE_GMP
 # include <gmp.h>
 #endif
@@ -40,17 +102,540 @@
 /* The official name of this program (e.g., no 'g' prefix).  */
 #define PROGRAM_NAME "factor"

-#define AUTHORS proper_name ("Paul Rubin")
+#define AUTHORS \
+  proper_name ("Paul Rubin"),                                           \
+  proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
+  proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")

 /* Token delimiters when reading from a file.  */
 #define DELIM "\n\t "

-static bool verbose = false;
+#ifndef STAT_SQUFOF
+# define STAT_SQUFOF 0
+#endif
+
+#ifndef USE_LONGLONG_H
+# define USE_LONGLONG_H 1
+#endif
+
+#if USE_LONGLONG_H
+
+/* Make definitions for longlong.h to make it do what it can do for us */
+# define W_TYPE_SIZE 64          /* bitcount for uintmax_t */
+# define UWtype  uintmax_t
+# define UHWtype unsigned long int
+# undef UDWtype
+# if HAVE_ATTRIBUTE_MODE
+typedef unsigned int UQItype    __attribute__ ((mode (QI)));
+typedef          int SItype     __attribute__ ((mode (SI)));
+typedef unsigned int USItype    __attribute__ ((mode (SI)));
+typedef          int DItype     __attribute__ ((mode (DI)));
+typedef unsigned int UDItype    __attribute__ ((mode (DI)));
+# else
+typedef unsigned char UQItype;
+typedef          long SItype;
+typedef unsigned long int USItype;
+#  if HAVE_LONG_LONG
+typedef long long int DItype;
+typedef unsigned long long int UDItype;
+#  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
+typedef long int DItype;
+typedef unsigned long int UDItype;
+#  endif
+# endif
+# define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files 
*/
+# define ASSERT(x)               /* FIXME make longlong.h really standalone */
+# define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
+# ifndef __GMP_GNUC_PREREQ
+#  define __GMP_GNUC_PREREQ(a,b) 1
+# endif
+# if _ARCH_PPC
+#  define HAVE_HOST_CPU_FAMILY_powerpc 1
+# endif
+# include "longlong.h"
+# ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+const unsigned char factor_clz_tab[129] =
+{
+  1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  9
+};
+# endif
+
+#else /* not USE_LONGLONG_H */
+
+# define W_TYPE_SIZE (8 * sizeof (uintmax_t))
+# define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
+# define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
+# define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
+
+#endif
+
+enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
+
+static enum alg_type alg;
+
+/* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
+#define MAX_NFACTS 26
+
+enum
+{
+  VERBOSE_OPTION = CHAR_MAX + 1
+};
+
+static struct option const long_options[] =
+{
+  {"verbose", no_argument, NULL, VERBOSE_OPTION},
+  {GETOPT_HELP_OPTION_DECL},
+  {GETOPT_VERSION_OPTION_DECL},
+  {NULL, 0, NULL, 0}
+};
+
+struct factors
+{
+  uintmax_t     plarge[2]; /* Can have a single large factor */
+  uintmax_t     p[MAX_NFACTS];
+  unsigned char e[MAX_NFACTS];
+  unsigned char nfactors;
+};

 #if HAVE_GMP
-static mpz_t *factor = NULL;
-static size_t nfactors_found = 0;
-static size_t nfactors_allocated = 0;
+struct mp_factors
+{
+  mpz_t             *p;
+  unsigned long int *e;
+  unsigned long int nfactors;
+};
+#endif
+
+static void factor (uintmax_t, uintmax_t, struct factors *);
+
+#ifndef umul_ppmm
+# define umul_ppmm(w1, w0, u, v)                                        \
+  do {                                                                  \
+    uintmax_t __x0, __x1, __x2, __x3;                                   \
+    unsigned long int __ul, __vl, __uh, __vh;                           \
+    uintmax_t __u = (u), __v = (v);                                     \
+                                                                        \
+    __ul = __ll_lowpart (__u);                                          \
+    __uh = __ll_highpart (__u);                                         \
+    __vl = __ll_lowpart (__v);                                          \
+    __vh = __ll_highpart (__v);                                         \
+                                                                        \
+    __x0 = (uintmax_t) __ul * __vl;                                     \
+    __x1 = (uintmax_t) __ul * __vh;                                     \
+    __x2 = (uintmax_t) __uh * __vl;                                     \
+    __x3 = (uintmax_t) __uh * __vh;                                     \
+                                                                        \
+    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
+    __x1 += __x2;               /* but this indeed can */               \
+    if (__x1 < __x2)            /* did we get it? */                    \
+      __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
+                                                                        \
+    (w1) = __x3 + __ll_highpart (__x1);                                 \
+    (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
+  } while (0)
+#endif
+
+#if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
+/* Define our own, not needing normalization. This function is
+   currently not performance critical, so keep it simple. Similar to
+   the mod macro below. */
+# undef udiv_qrnnd
+# define udiv_qrnnd(q, r, n1, n0, d)                                    \
+  do {                                                                  \
+    uintmax_t __d1, __d0, __q, __r1, __r0;                              \
+                                                                        \
+    assert ((n1) < (d));                                                \
+    __d1 = (d); __d0 = 0;                                               \
+    __r1 = (n1); __r0 = (n0);                                           \
+    __q = 0;                                                            \
+    for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
+      {                                                                 \
+        rsh2 (__d1, __d0, __d1, __d0, 1);                               \
+        __q <<= 1;                                                      \
+        if (ge2 (__r1, __r0, __d1, __d0))                               \
+          {                                                             \
+            __q++;                                                      \
+            sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
+          }                                                             \
+      }                                                                 \
+    (r) = __r0;                                                         \
+    (q) = __q;                                                          \
+  } while (0)
+#endif
+
+#if !defined add_ssaaaa
+# define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
+  do {                                                                  \
+    uintmax_t _add_x;                                                   \
+    _add_x = (al) + (bl);                                               \
+    (sh) = (ah) + (bh) + (_add_x < (al));                               \
+    (sl) = _add_x;                                                      \
+  } while (0)
+#endif
+
+#define rsh2(rh, rl, ah, al, cnt)                                       \
+  do {                                                                  \
+    (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
+    (rh) = (ah) >> (cnt);                                               \
+  } while (0)
+
+#define lsh2(rh, rl, ah, al, cnt)                                       \
+  do {                                                                  \
+    (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
+    (rl) = (al) << (cnt);                                               \
+  } while (0)
+
+#define ge2(ah, al, bh, bl)                                             \
+  ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
+
+#define gt2(ah, al, bh, bl)                                             \
+  ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
+
+#ifndef sub_ddmmss
+# define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
+  do {                                                                  \
+    uintmax_t _cy;                                                      \
+    _cy = (al) < (bl);                                                  \
+    (rl) = (al) - (bl);                                                 \
+    (rh) = (ah) - (bh) - _cy;                                           \
+  } while (0)
+#endif
+
+#ifndef count_leading_zeros
+# define count_leading_zeros(count, x) do {                             \
+    uintmax_t __clz_x = (x);                                            \
+    unsigned int __clz_c;                                               \
+    for (__clz_c = 0;                                                   \
+         (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
+         __clz_c += 8)                                                  \
+      __clz_x <<= 8;                                                    \
+    for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
+      __clz_x <<= 1;                                                    \
+    (count) = __clz_c;                                                  \
+  } while (0)
+#endif
+
+#ifndef count_trailing_zeros
+# define count_trailing_zeros(count, x) do {                            \
+    uintmax_t __ctz_x = (x);                                            \
+    unsigned int __ctz_c = 0;                                           \
+    while ((__ctz_x & 1) == 0)                                          \
+      {                                                                 \
+        __ctz_x >>= 1;                                                  \
+        __ctz_c++;                                                      \
+      }                                                                 \
+    (count) = __ctz_c;                                                  \
+  } while (0)
+#endif
+
+/* Requires that a < n and b <= n */
+#define submod(r,a,b,n)                                                 \
+  do {                                                                  \
+    uintmax_t _t = - (uintmax_t) (a < b);                               \
+    (r) = ((n) & _t) + (a) - (b);                                       \
+  } while (0)
+
+#define addmod(r,a,b,n)                                                 \
+  submod ((r), (a), ((n) - (b)), (n))
+
+/* Modular two-word addition and subtraction.  For performance reasons, the
+   most significant bit of n1 must be clear.  The destination variables must be
+   distinct from the mod operand.  */
+#define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
+  do {                                                                  \
+    add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
+    if (ge2 ((r1), (r0), (n1), (n0)))                                   \
+      sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
+  } while (0)
+#define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
+  do {                                                                  \
+    sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
+    if ((intmax_t) (r1) < 0)                                            \
+      add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
+  } while (0)
+
+#define HIGHBIT_TO_MASK(x)                                              \
+  (((intmax_t)-1 >> 1) < 0                                              \
+   ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
+   : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
+      ? UINTMAX_MAX : (uintmax_t) 0))
+
+/* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
+   Requires that d1 != 0.  */
+static uintmax_t
+mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
+{
+  int cntd, cnta;
+
+  assert (d1 != 0);
+
+  if (a1 == 0)
+    {
+      *r1 = 0;
+      return a0;
+    }
+
+  count_leading_zeros (cntd, d1);
+  count_leading_zeros (cnta, a1);
+  int cnt = cntd - cnta;
+  lsh2 (d1, d0, d1, d0, cnt);
+  for (int i = 0; i < cnt; i++)
+    {
+      if (ge2 (a1, a0, d1, d0))
+        sub_ddmmss (a1, a0, a1, a0, d1, d0);
+      rsh2 (d1, d0, d1, d0, 1);
+    }
+
+  *r1 = a1;
+  return a0;
+}
+
+static uintmax_t _GL_ATTRIBUTE_CONST
+gcd_odd (uintmax_t a, uintmax_t b)
+{
+  if ( (b & 1) == 0)
+    {
+      uintmax_t t = b;
+      b = a;
+      a = t;
+    }
+  if (a == 0)
+    return b;
+
+  /* Take out least significant one bit, to make room for sign */
+  b >>= 1;
+
+  for (;;)
+    {
+      uintmax_t t;
+      uintmax_t bgta;
+
+      while ((a & 1) == 0)
+        a >>= 1;
+      a >>= 1;
+
+      t = a - b;
+      if (t == 0)
+        return (a << 1) + 1;
+
+      bgta = HIGHBIT_TO_MASK (t);
+
+      /* b <-- min (a, b) */
+      b += (bgta & t);
+
+      /* a <-- |a - b| */
+      a = (t ^ bgta) - bgta;
+    }
+}
+
+static uintmax_t
+gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t 
b0)
+{
+  while ((a0 & 1) == 0)
+    rsh2 (a1, a0, a1, a0, 1);
+  while ((b0 & 1) == 0)
+    rsh2 (b1, b0, b1, b0, 1);
+
+  for (;;)
+    {
+      if ((b1 | a1) == 0)
+        {
+          *r1 = 0;
+          return gcd_odd (b0, a0);
+        }
+
+      if (gt2 (a1, a0, b1, b0))
+        {
+          sub_ddmmss (a1, a0, a1, a0, b1, b0);
+          do
+            rsh2 (a1, a0, a1, a0, 1);
+          while ((a0 & 1) == 0);
+        }
+      else if (gt2 (b1, b0, a1, a0))
+        {
+          sub_ddmmss (b1, b0, b1, b0, a1, a0);
+          do
+            rsh2 (b1, b0, b1, b0, 1);
+          while ((b0 & 1) == 0);
+        }
+      else
+        break;
+    }
+
+  *r1 = a1;
+  return a0;
+}
+
+static void
+factor_insert_multiplicity (struct factors *factors,
+                            uintmax_t prime, unsigned int m)
+{
+  unsigned int nfactors = factors->nfactors;
+  uintmax_t *p = factors->p;
+  unsigned char *e = factors->e;
+
+  /* Locate position for insert new or increment e.  */
+  int i;
+  for (i = nfactors - 1; i >= 0; i--)
+    {
+      if (p[i] <= prime)
+        break;
+    }
+
+  if (i < 0 || p[i] != prime)
+    {
+      for (int j = nfactors - 1; j > i; j--)
+        {
+          p[j + 1] = p[j];
+          e[j + 1] = e[j];
+        }
+      p[i + 1] = prime;
+      e[i + 1] = m;
+      factors->nfactors = nfactors + 1;
+    }
+  else
+    {
+      e[i] += m;
+    }
+}
+
+#define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
+
+static void
+factor_insert_large (struct factors *factors,
+                     uintmax_t p1, uintmax_t p0)
+{
+  if (p1 > 0)
+    {
+      assert (factors->plarge[1] == 0);
+      factors->plarge[0] = p0;
+      factors->plarge[1] = p1;
+    }
+  else
+    factor_insert (factors, p0);
+}
+
+#if HAVE_GMP
+static void mp_factor (mpz_t, struct mp_factors *);
+
+static void
+mp_factor_init (struct mp_factors *factors)
+{
+  factors->p = xmalloc (1);
+  factors->e = xmalloc (1);
+  factors->nfactors = 0;
+}
+
+static void
+mp_factor_clear (struct mp_factors *factors)
+{
+  for (unsigned int i = 0; i < factors->nfactors; i++)
+    mpz_clear (factors->p[i]);
+
+  free (factors->p);
+  free (factors->e);
+}
+
+static void
+mp_factor_insert (struct mp_factors *factors, mpz_t prime)
+{
+  unsigned long int nfactors = factors->nfactors;
+  mpz_t         *p  = factors->p;
+  unsigned long int *e  = factors->e;
+  long i;
+
+  /* Locate position for insert new or increment e.  */
+  for (i = nfactors - 1; i >= 0; i--)
+    {
+      if (mpz_cmp (p[i], prime) <= 0)
+        break;
+    }
+
+  if (i < 0 || mpz_cmp (p[i], prime) != 0)
+    {
+      p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
+      e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
+
+      mpz_init (p[nfactors]);
+      for (long j = nfactors - 1; j > i; j--)
+        {
+          mpz_set (p[j + 1], p[j]);
+          e[j + 1] = e[j];
+        }
+      mpz_set (p[i + 1], prime);
+      e[i + 1] = 1;
+
+      factors->p = p;
+      factors->e = e;
+      factors->nfactors = nfactors + 1;
+    }
+  else
+    {
+      e[i] += 1;
+    }
+}
+
+static void
+mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
+{
+  mpz_t pz;
+
+  mpz_init_set_ui (pz, prime);
+  mp_factor_insert (factors, pz);
+  mpz_clear (pz);
+}
+#endif /* HAVE_GMP */
+
+
+#define P(a,b,c,d) a,
+static const unsigned char primes_diff[] = {
+#include "primes.h"
+0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
+};
+#undef P
+
+#define PRIMES_PTAB_ENTRIES \
+  (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
+
+#define P(a,b,c,d) b,
+static const unsigned char primes_diff8[] = {
+#include "primes.h"
+0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
+};
+#undef P
+
+struct primes_dtab
+{
+  uintmax_t binv, lim;
+};
+
+#define P(a,b,c,d) {c,d},
+static const struct primes_dtab primes_dtab[] = {
+#include "primes.h"
+{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
+};
+#undef P
+
+/* This flag is honored only in the GMP code. */
+static int verbose = 0;
+
+/* Prove primality or run probabilistic tests.  */
+static bool flag_prove_primality = true;
+
+/* Number of Miller-Rabin tests to run when not proving primality. */
+#define MR_REPS 25
+
+#ifdef __GNUC__
+# define LIKELY(cond)    __builtin_expect ((cond), 1)
+# define UNLIKELY(cond)  __builtin_expect ((cond), 0)
+#else
+# define LIKELY(cond)    (cond)
+# define UNLIKELY(cond)  (cond)
+#endif

 static void
 debug (char const *fmt, ...)
@@ -65,368 +650,1637 @@ debug (char const *fmt, ...)
 }

 static void
-emit_factor (mpz_t n)
+factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
+                      unsigned int off)
 {
-  if (nfactors_found == nfactors_allocated)
-    factor = X2NREALLOC (factor, &nfactors_allocated);
-  mpz_init (factor[nfactors_found]);
-  mpz_set (factor[nfactors_found], n);
-  ++nfactors_found;
+  for (unsigned int j = 0; j < off; j++)
+    p += primes_diff[i + j];
+  factor_insert (factors, p);
 }

+/* Trial division with odd primes uses the following trick.
+
+   Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
+   consider the case t < B (this is the second loop below).
+
+   From our tables we get
+
+     binv = p^{-1} (mod B)
+     lim = floor ( (B-1) / p ).
+
+   First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
+   (and all quotients in this range occur for some t).
+
+   Then t = q * p is true also (mod B), and p is invertible we get
+
+     q = t * binv (mod B).
+
+   Next, assume that t is *not* divisible by p. Since multiplication
+   by binv (mod B) is a one-to-one mapping,
+
+     t * binv (mod B) > lim,
+
+   because all the smaller values are already taken.
+
+   This can be summed up by saying that the function
+
+     q(t) = binv * t (mod B)
+
+   is a permutation of the range 0 <= t < B, with the curious property
+   that it maps the multiples of p onto the range 0 <= q <= lim, in
+   order, and the non-multiples of p onto the range lim < q < B.
+ */
+
+static uintmax_t
+factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
+                       struct factors *factors)
+{
+  if (t0 % 2 == 0)
+    {
+      unsigned int cnt;
+
+      if (t0 == 0)
+        {
+          count_trailing_zeros (cnt, t1);
+          t0 = t1 >> cnt;
+          t1 = 0;
+          cnt += W_TYPE_SIZE;
+        }
+      else
+        {
+          count_trailing_zeros (cnt, t0);
+          rsh2 (t1, t0, t1, t0, cnt);
+        }
+
+      factor_insert_multiplicity (factors, 2, cnt);
+    }
+
+  uintmax_t p = 3;
+  unsigned int i;
+  for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
+    {
+      for (;;)
+        {
+          uintmax_t q1, q0, hi, lo;
+
+          q0 = t0 * primes_dtab[i].binv;
+          umul_ppmm (hi, lo, q0, p);
+          if (hi > t1)
+            break;
+          hi = t1 - hi;
+          q1 = hi * primes_dtab[i].binv;
+          if (LIKELY (q1 > primes_dtab[i].lim))
+            break;
+          t1 = q1; t0 = q0;
+          factor_insert (factors, p);
+        }
+      p += primes_diff[i + 1];
+    }
+  if (t1p)
+    *t1p = t1;
+
+#define DIVBLOCK(I)                                                     \
+  do {                                                                  \
+    for (;;)                                                            \
+      {                                                                 \
+        q = t0 * pd[I].binv;                                            \
+        if (LIKELY (q > pd[I].lim))                                     \
+          break;                                                        \
+        t0 = q;                                                         \
+        factor_insert_refind (factors, p, i + 1, I);                    \
+      }                                                                 \
+  } while (0)
+
+  for (; i < PRIMES_PTAB_ENTRIES; i += 8)
+    {
+      uintmax_t q;
+      const struct primes_dtab *pd = &primes_dtab[i];
+      DIVBLOCK (0);
+      DIVBLOCK (1);
+      DIVBLOCK (2);
+      DIVBLOCK (3);
+      DIVBLOCK (4);
+      DIVBLOCK (5);
+      DIVBLOCK (6);
+      DIVBLOCK (7);
+
+      p += primes_diff8[i];
+      if (p * p > t0)
+        break;
+    }
+
+  return t0;
+}
+
+#if HAVE_GMP
 static void
-emit_ul_factor (unsigned long int i)
+mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 {
-  mpz_t t;
-  mpz_init (t);
-  mpz_set_ui (t, i);
-  emit_factor (t);
-  mpz_clear (t);
+  mpz_t q;
+  unsigned long int p;
+
+  debug ("[trial division] ");
+
+  mpz_init (q);
+
+  p = mpz_scan1 (t, 0);
+  mpz_div_2exp (t, t, p);
+  while (p)
+    {
+      mp_factor_insert_ui (factors, 2);
+      --p;
+    }
+
+  p = 3;
+  for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
+    {
+      if (! mpz_divisible_ui_p (t, p))
+        {
+          p += primes_diff[i++];
+          if (mpz_cmp_ui (t, p * p) < 0)
+            break;
+        }
+      else
+        {
+          mpz_tdiv_q_ui (t, t, p);
+          mp_factor_insert_ui (factors, p);
+        }
+    }
+
+  mpz_clear (q);
+}
+#endif
+
+/* Entry i contains (2i+1)^(-1) mod 2^8.  */
+static const unsigned char  binvert_table[128] =
+{
+  0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
+  0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
+  0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
+  0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
+  0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
+  0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
+  0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
+  0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
+  0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
+  0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
+  0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
+  0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
+  0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
+  0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
+  0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
+  0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
+};
+
+/* Compute n^(-1) mod B, using a Newton iteration.  */
+#define binv(inv,n)                                                     \
+  do {                                                                  \
+    uintmax_t  __n = (n);                                               \
+    uintmax_t  __inv;                                                   \
+                                                                        \
+    __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
+    if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
+    if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
+    if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
+                                                                        \
+    if (W_TYPE_SIZE > 64)                                               \
+      {                                                                 \
+        int  __invbits = 64;                                            \
+        do {                                                            \
+          __inv = 2 * __inv - __inv * __inv * __n;                      \
+          __invbits *= 2;                                               \
+        } while (__invbits < W_TYPE_SIZE);                              \
+      }                                                                 \
+                                                                        \
+    (inv) = __inv;                                                      \
+  } while (0)
+
+/* q = u / d, assuming d|u.  */
+#define divexact_21(q1, q0, u1, u0, d)                                  \
+  do {                                                                  \
+    uintmax_t _di, _q0;                                                 \
+    binv (_di, (d));                                                    \
+    _q0 = (u0) * _di;                                                   \
+    if ((u1) >= (d))                                                    \
+      {                                                                 \
+        uintmax_t _p1, _p0;                                             \
+        umul_ppmm (_p1, _p0, _q0, d);                                   \
+        (q1) = ((u1) - _p1) * _di;                                      \
+        (q0) = _q0;                                                     \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        (q0) = _q0;                                                     \
+        (q1) = 0;                                                       \
+      }                                                                 \
+  } while (0)
+
+/* x B (mod n). */
+#define redcify(r_prim, r, n)                                           \
+  do {                                                                  \
+    uintmax_t _redcify_q ATTRIBUTE_UNUSED;                              \
+    udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
+  } while (0)
+
+/* x B^2 (mod n). Requires x > 0, n1 < B/2 */
+#define redcify2(r1, r0, x, n1, n0)                                     \
+  do {                                                                  \
+    uintmax_t _r1, _r0, _i;                                             \
+    if ((x) < (n1))                                                     \
+      {                                                                 \
+        _r1 = (x); _r0 = 0;                                             \
+        _i = W_TYPE_SIZE;                                               \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        _r1 = 0; _r0 = (x);                                             \
+        _i = 2*W_TYPE_SIZE;                                             \
+      }                                                                 \
+    while (_i-- > 0)                                                    \
+      {                                                                 \
+        lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
+        if (ge2 (_r1, _r0, (n1), (n0)))                                 \
+          sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
+      }                                                                 \
+    (r1) = _r1;                                                         \
+    (r0) = _r0;                                                         \
+  } while (0)
+
+/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
+   Both a and b must be in redc form, the result will be in redc form too. */
+static inline uintmax_t
+mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
+{
+  uintmax_t rh, rl, q, th, tl, xh;
+
+  umul_ppmm (rh, rl, a, b);
+  q = rl * mi;
+  umul_ppmm (th, tl, q, m);
+  xh = rh - th;
+  if (rh < th)
+    xh += m;
+
+  return xh;
+}
+
+/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
+   Both a and b must be in redc form, the result will be in redc form too.
+   For performance reasons, the most significant bit of m must be clear. */
+static uintmax_t
+mulredc2 (uintmax_t *r1p,
+          uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
+          uintmax_t m1, uintmax_t m0, uintmax_t mi)
+{
+  uintmax_t r1, r0, q, p1, p0, t1, t0, s1, s0;
+  mi = -mi;
+  assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
+  assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
+  assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
+
+  /* First compute a0 * <b1, b0> B^{-1}
+        +-----+
+        |a0 b0|
+     +--+--+--+
+     |a0 b1|
+     +--+--+--+
+        |q0 m0|
+     +--+--+--+
+     |q0 m1|
+    -+--+--+--+
+     |r1|r0| 0|
+     +--+--+--+
+  */
+  umul_ppmm (t1, t0, a0, b0);
+  umul_ppmm (r1, r0, a0, b1);
+  q = mi * t0;
+  umul_ppmm (p1, p0, q, m0);
+  umul_ppmm (s1, s0, q, m1);
+  r0 += (t0 != 0); /* Carry */
+  add_ssaaaa (r1, r0, r1, r0, 0, p1);
+  add_ssaaaa (r1, r0, r1, r0, 0, t1);
+  add_ssaaaa (r1, r0, r1, r0, s1, s0);
+
+  /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
+        +-----+
+        |a1 b0|
+        +--+--+
+        |r1|r0|
+     +--+--+--+
+     |a1 b1|
+     +--+--+--+
+        |q1 m0|
+     +--+--+--+
+     |q1 m1|
+    -+--+--+--+
+     |r1|r0| 0|
+     +--+--+--+
+  */
+  umul_ppmm (t1, t0, a1, b0);
+  umul_ppmm (s1, s0, a1, b1);
+  add_ssaaaa (t1, t0, t1, t0, 0, r0);
+  q = mi * t0;
+  add_ssaaaa (r1, r0, s1, s0, 0, r1);
+  umul_ppmm (p1, p0, q, m0);
+  umul_ppmm (s1, s0, q, m1);
+  r0 += (t0 != 0); /* Carry */
+  add_ssaaaa (r1, r0, r1, r0, 0, p1);
+  add_ssaaaa (r1, r0, r1, r0, 0, t1);
+  add_ssaaaa (r1, r0, r1, r0, s1, s0);
+
+  if (ge2 (r1, r0, m1, m0))
+    sub_ddmmss (r1, r0, r1, r0, m1, m0);
+
+  *r1p = r1;
+  return r0;
+}
+
+static uintmax_t _GL_ATTRIBUTE_CONST
+powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
+{
+  uintmax_t y = one;
+
+  if (e & 1)
+    y = b;
+
+  while (e != 0)
+    {
+      b = mulredc (b, b, n, ni);
+      e >>= 1;
+
+      if (e & 1)
+        y = mulredc (y, b, n, ni);
+    }
+
+  return y;
+}
+
+static uintmax_t
+powm2 (uintmax_t *r1m,
+       const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
+       uintmax_t ni, const uintmax_t *one)
+{
+  uintmax_t r1, r0, b1, b0, n1, n0;
+  unsigned int i;
+  uintmax_t e;
+
+  b0 = bp[0];
+  b1 = bp[1];
+  n0 = np[0];
+  n1 = np[1];
+
+  r0 = one[0];
+  r1 = one[1];
+
+  for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
+    {
+      if (e & 1)
+        {
+          r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
+          r1 = *r1m;
+        }
+      b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
+      b1 = *r1m;
+    }
+  for (e = ep[1]; e > 0; e >>= 1)
+    {
+      if (e & 1)
+        {
+          r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
+          r1 = *r1m;
+        }
+      b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
+      b1 = *r1m;
+    }
+  *r1m = r1;
+  return r0;
+}
+
+static bool _GL_ATTRIBUTE_CONST
+millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
+             unsigned int k, uintmax_t one)
+{
+  uintmax_t y = powm (b, q, n, ni, one);
+
+  uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
+
+  if (y == one || y == nm1)
+    return true;
+
+  for (unsigned int i = 1; i < k; i++)
+    {
+      y = mulredc (y, y, n, ni);
+
+      if (y == nm1)
+        return true;
+      if (y == one)
+        return false;
+    }
+  return false;
+}
+
+static bool
+millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
+              const uintmax_t *qp, unsigned int k, const uintmax_t *one)
+{
+  uintmax_t y1, y0, nm1_1, nm1_0, r1m;
+
+  y0 = powm2 (&r1m, bp, qp, np, ni, one);
+  y1 = r1m;
+
+  if (y0 == one[0] && y1 == one[1])
+    return true;
+
+  sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
+
+  if (y0 == nm1_0 && y1 == nm1_1)
+    return true;
+
+  for (unsigned int i = 1; i < k; i++)
+    {
+      y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
+      y1 = r1m;
+
+      if (y0 == nm1_0 && y1 == nm1_1)
+        return true;
+      if (y0 == one[0] && y1 == one[1])
+        return false;
+    }
+  return false;
+}
+
+#if HAVE_GMP
+static bool
+mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
+                mpz_srcptr q, unsigned long int k)
+{
+  mpz_powm (y, x, q, n);
+
+  if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
+    return true;
+
+  for (unsigned long int i = 1; i < k; i++)
+    {
+      mpz_powm_ui (y, y, 2, n);
+      if (mpz_cmp (y, nm1) == 0)
+        return true;
+      if (mpz_cmp_ui (y, 1) == 0)
+        return false;
+    }
+  return false;
+}
+#endif
+
+/* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
+   have been observed.  The average seem to be about 2.  */
+static bool
+prime_p (uintmax_t n)
+{
+  int k;
+  bool is_prime;
+  uintmax_t a_prim, one, ni;
+  struct factors factors;
+
+  if (n <= 1)
+    return false;
+
+  /* We have already casted out small primes. */
+  if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
+    return true;
+
+  /* Precomputation for Miller-Rabin.  */
+  uintmax_t q = n - 1;
+  for (k = 0; (q & 1) == 0; k++)
+    q >>= 1;
+
+  uintmax_t a = 2;
+  binv (ni, n);                 /* ni <- 1/n mod B */
+  redcify (one, 1, n);
+  addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
+
+  /* Perform a Miller-Rabin test, finds most composites quickly.  */
+  if (!millerrabin (n, ni, a_prim, q, k, one))
+    return false;
+
+  if (flag_prove_primality)
+    {
+      /* Factor n-1 for Lucas.  */
+      factor (0, n - 1, &factors);
+    }
+
+  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
+     number composite.  */
+  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
+    {
+      if (flag_prove_primality)
+        {
+          is_prime = true;
+          for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
+            {
+              is_prime
+                = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
+            }
+        }
+      else
+        {
+          /* After enough Miller-Rabin runs, be content. */
+          is_prime = (r == MR_REPS - 1);
+        }
+
+      if (is_prime)
+        return true;
+
+      a += primes_diff[r];      /* Establish new base.  */
+
+      /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
+         on most processors, since it avoids udiv_qrnnd.  If we go down the
+         udiv_qrnnd_preinv path, this code should be replaced.  */
+      {
+        uintmax_t s1, s0;
+        umul_ppmm (s1, s0, one, a);
+        if (LIKELY (s1 == 0))
+          a_prim = s0 % n;
+        else
+          {
+            uintmax_t dummy ATTRIBUTE_UNUSED;
+            udiv_qrnnd (dummy, a_prim, s1, s0, n);
+          }
+      }
+
+      if (!millerrabin (n, ni, a_prim, q, k, one))
+        return false;
+    }
+
+  error (0, 0, _("Lucas prime test failure.  This should not happen"));
+  abort ();
+}
+
+static bool
+prime2_p (uintmax_t n1, uintmax_t n0)
+{
+  uintmax_t q[2], nm1[2];
+  uintmax_t a_prim[2];
+  uintmax_t one[2];
+  uintmax_t na[2];
+  uintmax_t ni;
+  unsigned int k;
+  struct factors factors;
+
+  if (n1 == 0)
+    return prime_p (n0);
+
+  nm1[1] = n1 - (n0 == 0);
+  nm1[0] = n0 - 1;
+  if (nm1[0] == 0)
+    {
+      count_trailing_zeros (k, nm1[1]);
+
+      q[0] = nm1[1] >> k;
+      q[1] = 0;
+      k += W_TYPE_SIZE;
+    }
+  else
+    {
+      count_trailing_zeros (k, nm1[0]);
+      rsh2 (q[1], q[0], nm1[1], nm1[0], k);
+    }
+
+  uintmax_t a = 2;
+  binv (ni, n0);
+  redcify2 (one[1], one[0], 1, n1, n0);
+  addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
+
+  /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
+  na[0] = n0;
+  na[1] = n1;
+
+  if (!millerrabin2 (na, ni, a_prim, q, k, one))
+    return false;
+
+  if (flag_prove_primality)
+    {
+      /* Factor n-1 for Lucas.  */
+      factor (nm1[1], nm1[0], &factors);
+    }
+
+  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
+     number composite.  */
+  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
+    {
+      bool is_prime;
+      uintmax_t e[2], y[2];
+
+      if (flag_prove_primality)
+        {
+          is_prime = true;
+          if (factors.plarge[1])
+            {
+              uintmax_t pi;
+              binv (pi, factors.plarge[0]);
+              e[0] = pi * nm1[0];
+              e[1] = 0;
+              y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
+              is_prime = (y[0] != one[0] || y[1] != one[1]);
+            }
+          for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
+            {
+              /* FIXME: We always have the factor 2. Do we really need to
+                 handle it here? We have done the same powering as part
+                 of millerrabin. */
+              if (factors.p[i] == 2)
+                rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
+              else
+                divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
+              y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
+              is_prime = (y[0] != one[0] || y[1] != one[1]);
+            }
+        }
+      else
+        {
+          /* After enough Miller-Rabin runs, be content. */
+          is_prime = (r == MR_REPS - 1);
+        }
+
+      if (is_prime)
+        return true;
+
+      a += primes_diff[r];      /* Establish new base.  */
+      redcify2 (a_prim[1], a_prim[0], a, n1, n0);
+
+      if (!millerrabin2 (na, ni, a_prim, q, k, one))
+        return false;
+    }
+
+  error (0, 0, _("Lucas prime test failure.  This should not happen"));
+  abort ();
+}
+
+#if HAVE_GMP
+static bool
+mp_prime_p (mpz_t n)
+{
+  bool is_prime;
+  mpz_t q, a, nm1, tmp;
+  struct mp_factors factors;
+
+  if (mpz_cmp_ui (n, 1) <= 0)
+    return false;
+
+  /* We have already casted out small primes. */
+  if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
+    return true;
+
+  mpz_inits (q, a, nm1, tmp, NULL);
+
+  /* Precomputation for Miller-Rabin.  */
+  mpz_sub_ui (nm1, n, 1);
+
+  /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
+  unsigned long int k = mpz_scan1 (nm1, 0);
+  mpz_tdiv_q_2exp (q, nm1, k);
+
+  mpz_set_ui (a, 2);
+
+  /* Perform a Miller-Rabin test, finds most composites quickly.  */
+  if (!mp_millerrabin (n, nm1, a, tmp, q, k))
+    {
+      is_prime = false;
+      goto ret2;
+    }
+
+  if (flag_prove_primality)
+    {
+      /* Factor n-1 for Lucas.  */
+      mpz_set (tmp, nm1);
+      mp_factor (tmp, &factors);
+    }
+
+  /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
+     number composite.  */
+  for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
+    {
+      if (flag_prove_primality)
+        {
+          is_prime = true;
+          for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
+            {
+              mpz_divexact (tmp, nm1, factors.p[i]);
+              mpz_powm (tmp, a, tmp, n);
+              is_prime = mpz_cmp_ui (tmp, 1) != 0;
+            }
+        }
+      else
+        {
+          /* After enough Miller-Rabin runs, be content. */
+          is_prime = (r == MR_REPS - 1);
+        }
+
+      if (is_prime)
+        goto ret1;
+
+      mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
+
+      if (!mp_millerrabin (n, nm1, a, tmp, q, k))
+        {
+          is_prime = false;
+          goto ret1;
+        }
+    }
+
+  error (0, 0, _("Lucas prime test failure.  This should not happen"));
+  abort ();
+
+ ret1:
+  if (flag_prove_primality)
+    mp_factor_clear (&factors);
+ ret2:
+  mpz_clears (q, a, nm1, tmp, NULL);
+
+  return is_prime;
 }
+#endif
+
+static void
+factor_using_pollard_rho (uintmax_t n, unsigned long int a,
+                          struct factors *factors)
+{
+  uintmax_t x, z, y, P, t, ni, g;
+
+  unsigned long int k = 1;
+  unsigned long int l = 1;
+
+  redcify (P, 1, n);
+  addmod (x, P, P, n);          /* i.e., redcify(2) */
+  y = z = x;
+
+  while (n != 1)
+    {
+      assert (a < n);
+
+      binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
+
+      for (;;)
+        {
+          do
+            {
+              x = mulredc (x, x, n, ni);
+              addmod (x, x, a, n);
+
+              submod (t, z, x, n);
+              P = mulredc (P, t, n, ni);
+
+              if (k % 32 == 1)
+                {
+                  if (gcd_odd (P, n) != 1)
+                    goto factor_found;
+                  y = x;
+                }
+            }
+          while (--k != 0);
+
+          z = x;
+          k = l;
+          l = 2 * l;
+          for (unsigned long int i = 0; i < k; i++)
+            {
+              x = mulredc (x, x, n, ni);
+              addmod (x, x, a, n);
+            }
+          y = x;
+        }
+
+    factor_found:
+      do
+        {
+          y = mulredc (y, y, n, ni);
+          addmod (y, y, a, n);
+
+          submod (t, z, y, n);
+          g = gcd_odd (t, n);
+        }
+      while (g == 1);
+
+      n = n / g;
+
+      if (!prime_p (g))
+        factor_using_pollard_rho (g, a + 1, factors);
+      else
+        factor_insert (factors, g);
+
+      if (prime_p (n))
+        {
+          factor_insert (factors, n);
+          break;
+        }
+
+      x = x % n;
+      z = z % n;
+      y = y % n;
+    }
+}
+
+static void
+factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
+                           struct factors *factors)
+{
+  uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
+
+  unsigned long int k = 1;
+  unsigned long int l = 1;
+
+  redcify2 (P1, P0, 1, n1, n0);
+  addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
+  y1 = z1 = x1;
+  y0 = z0 = x0;
+
+  while (n1 != 0 || n0 != 1)
+    {
+      binv (ni, n0);
+
+      for (;;)
+        {
+          do
+            {
+              x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
+              x1 = r1m;
+              addmod2 (x1, x0, x1, x0, 0, a, n1, n0);
+
+              submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
+              P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
+              P1 = r1m;
+
+              if (k % 32 == 1)
+                {
+                  g0 = gcd2_odd (&g1, P1, P0, n1, n0);
+                  if (g1 != 0 || g0 != 1)
+                    goto factor_found;
+                  y1 = x1; y0 = x0;
+                }
+            }
+          while (--k != 0);
+
+          z1 = x1; z0 = x0;
+          k = l;
+          l = 2 * l;
+          for (unsigned long int i = 0; i < k; i++)
+            {
+              x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
+              x1 = r1m;
+              addmod2 (x1, x0, x1, x0, 0, a, n1, n0);
+            }
+          y1 = x1; y0 = x0;
+        }

-static void
-factor_using_division (mpz_t t, unsigned int limit)
-{
-  mpz_t q, r;
-  unsigned long int f;
-  int ai;
-  static unsigned int const add[] = {4, 2, 4, 2, 4, 6, 2, 6};
-  unsigned int const *addv = add;
-  unsigned int failures;
+    factor_found:
+      do
+        {
+          y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
+          y1 = r1m;
+          addmod2 (y1, y0, y1, y0, 0, a, n1, n0);

-  debug ("[trial division (%u)] ", limit);
+          submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
+          g0 = gcd2_odd (&g1, t1, t0, n1, n0);
+        }
+      while (g1 == 0 && g0 == 1);

-  mpz_init (q);
-  mpz_init (r);
+      if (g1 == 0)
+        {
+          /* The found factor is one word. */
+          divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */

-  f = mpz_scan1 (t, 0);
-  mpz_div_2exp (t, t, f);
-  while (f)
-    {
-      emit_ul_factor (2);
-      --f;
-    }
+          if (!prime_p (g0))
+            factor_using_pollard_rho (g0, a + 1, factors);
+          else
+            factor_insert (factors, g0);
+        }
+      else
+        {
+          /* The found factor is two words.  This is highly unlikely, thus hard
+             to trigger.  Please be careful before you change this code!  */
+          uintmax_t ginv;

-  while (true)
-    {
-      mpz_tdiv_qr_ui (q, r, t, 3);
-      if (mpz_cmp_ui (r, 0) != 0)
-        break;
-      mpz_set (t, q);
-      emit_ul_factor (3);
-    }
+          binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
+          n0 = ginv * n0;       /* fit one word, we can compute the quotient */
+          n1 = 0;               /* modulo B, ignoring the high divisor word. */

-  while (true)
-    {
-      mpz_tdiv_qr_ui (q, r, t, 5);
-      if (mpz_cmp_ui (r, 0) != 0)
-        break;
-      mpz_set (t, q);
-      emit_ul_factor (5);
-    }
+          if (!prime2_p (g1, g0))
+            factor_using_pollard_rho2 (g1, g0, a + 1, factors);
+          else
+            factor_insert_large (factors, g1, g0);
+        }

-  failures = 0;
-  f = 7;
-  ai = 0;
-  while (mpz_cmp_ui (t, 1) != 0)
-    {
-      mpz_tdiv_qr_ui (q, r, t, f);
-      if (mpz_cmp_ui (r, 0) != 0)
+      if (n1 == 0)
         {
-          f += addv[ai];
-          if (mpz_cmp_ui (q, f) < 0)
-            break;
-          ai = (ai + 1) & 7;
-          failures++;
-          if (failures > limit)
-            break;
+          if (prime_p (n0))
+            {
+              factor_insert (factors, n0);
+              break;
+            }
+
+          factor_using_pollard_rho (n0, a, factors);
+          return;
         }
-      else
+
+      if (prime2_p (n1, n0))
         {
-          mpz_swap (t, q);
-          emit_ul_factor (f);
-          failures = 0;
+          factor_insert_large (factors, n1, n0);
+          break;
         }
-    }

-  mpz_clear (q);
-  mpz_clear (r);
+      x0 = mod2 (&x1, x1, x0, n1, n0);
+      z0 = mod2 (&z1, z1, z0, n1, n0);
+      y0 = mod2 (&y1, y1, y0, n1, n0);
+    }
 }

-/* The number of Miller-Rabin tests we require.  */
-enum { MR_REPS = 25 };
-
+#if HAVE_GMP
 static void
-factor_using_pollard_rho (mpz_t n, int a_int)
+mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
+                             struct mp_factors *factors)
 {
-  mpz_t x, x1, y, P;
-  mpz_t a;
-  mpz_t g;
-  mpz_t t1, t2;
-  int k, l, c;
-
-  debug ("[pollard-rho (%d)] ", a_int);
+  mpz_t x, z, y, P;
+  mpz_t t, t2;

-  mpz_init (g);
-  mpz_init (t1);
-  mpz_init (t2);
+  debug ("[pollard-rho (%lu)] ", a);

-  mpz_init_set_si (a, a_int);
+  mpz_inits (t, t2, NULL);
   mpz_init_set_si (y, 2);
   mpz_init_set_si (x, 2);
-  mpz_init_set_si (x1, 2);
-  k = 1;
-  l = 1;
+  mpz_init_set_si (z, 2);
   mpz_init_set_ui (P, 1);
-  c = 0;
+
+  unsigned long long int k = 1;
+  unsigned long long int l = 1;

   while (mpz_cmp_ui (n, 1) != 0)
     {
-S2:
-      mpz_mul (x, x, x); mpz_add (x, x, a); mpz_mod (x, x, n);
-
-      mpz_sub (t1, x1, x); mpz_mul (t2, P, t1); mpz_mod (P, t2, n);
-      c++;
-      if (c == 20)
+      for (;;)
         {
-          c = 0;
-          mpz_gcd (g, P, n);
-          if (mpz_cmp_ui (g, 1) != 0)
-            goto S4;
+          do
+            {
+              mpz_mul (t, x, x);
+              mpz_mod (x, t, n);
+              mpz_add_ui (x, x, a);
+
+              mpz_sub (t, z, x);
+              mpz_mul (t2, P, t);
+              mpz_mod (P, t2, n);
+
+              if (k % 32 == 1)
+                {
+                  mpz_gcd (t, P, n);
+                  if (mpz_cmp_ui (t, 1) != 0)
+                    goto factor_found;
+                  mpz_set (y, x);
+                }
+            }
+          while (--k != 0);
+
+          mpz_set (z, x);
+          k = l;
+          l = 2 * l;
+          for (unsigned long long int i = 0; i < k; i++)
+            {
+              mpz_mul (t, x, x);
+              mpz_mod (x, t, n);
+              mpz_add_ui (x, x, a);
+            }
           mpz_set (y, x);
         }

-      k--;
-      if (k > 0)
-        goto S2;
-
-      mpz_gcd (g, P, n);
-      if (mpz_cmp_ui (g, 1) != 0)
-        goto S4;
-
-      mpz_set (x1, x);
-      k = l;
-      l = 2 * l;
-      unsigned int i;
-      for (i = 0; i < k; i++)
-        {
-          mpz_mul (x, x, x); mpz_add (x, x, a); mpz_mod (x, x, n);
-        }
-      mpz_set (y, x);
-      c = 0;
-      goto S2;
-S4:
+    factor_found:
       do
         {
-          mpz_mul (y, y, y); mpz_add (y, y, a); mpz_mod (y, y, n);
-          mpz_sub (t1, x1, y); mpz_gcd (g, t1, n);
+          mpz_mul (t, y, y);
+          mpz_mod (y, t, n);
+          mpz_add_ui (y, y, a);
+
+          mpz_sub (t, z, y);
+          mpz_gcd (t, t, n);
         }
-      while (mpz_cmp_ui (g, 1) == 0);
+      while (mpz_cmp_ui (t, 1) == 0);

-      mpz_div (n, n, g);       /* divide by g, before g is overwritten */
+      mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */

-      if (!mpz_probab_prime_p (g, MR_REPS))
+      if (!mp_prime_p (t))
         {
-          do
-            {
-              mp_limb_t a_limb;
-              mpn_random (&a_limb, (mp_size_t) 1);
-              a_int = (int) a_limb;
-            }
-          while (a_int == -2 || a_int == 0);
-
           debug ("[composite factor--restarting pollard-rho] ");
-          factor_using_pollard_rho (g, a_int);
+          mp_factor_using_pollard_rho (t, a + 1, factors);
         }
       else
         {
-          emit_factor (g);
+          mp_factor_insert (factors, t);
         }
-      mpz_mod (x, x, n);
-      mpz_mod (x1, x1, n);
-      mpz_mod (y, y, n);
-      if (mpz_probab_prime_p (n, MR_REPS))
+
+      if (mp_prime_p (n))
         {
-          emit_factor (n);
+          mp_factor_insert (factors, n);
           break;
         }
+
+      mpz_mod (x, x, n);
+      mpz_mod (z, z, n);
+      mpz_mod (y, y, n);
     }

-  mpz_clear (g);
-  mpz_clear (P);
-  mpz_clear (t2);
-  mpz_clear (t1);
-  mpz_clear (a);
-  mpz_clear (x1);
-  mpz_clear (x);
-  mpz_clear (y);
+  mpz_clears (P, t2, t, z, x, y, NULL);
 }
+#endif

-#else
-
-static void
-debug (char const *fmt ATTRIBUTE_UNUSED, ...)
+/* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
+   algorithm is replaced, consider also returning the remainder. */
+static uintmax_t _GL_ATTRIBUTE_CONST
+isqrt (uintmax_t n)
 {
+  uintmax_t x;
+  unsigned c;
+  if (n == 0)
+    return 0;
+
+  count_leading_zeros (c, n);
+
+  /* Make x > sqrt(n). This will be invariant through the loop. */
+  x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
+
+  for (;;)
+    {
+      uintmax_t y = (x + n/x) / 2;
+      if (y >= x)
+        return x;
+
+      x = y;
+    }
 }

-#endif
+static uintmax_t _GL_ATTRIBUTE_CONST
+isqrt2 (uintmax_t nh, uintmax_t nl)
+{
+  unsigned int shift;
+  uintmax_t x;
+
+  /* Ensures the remainder fits in an uintmax_t. */
+  assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
+
+  if (nh == 0)
+    return isqrt (nl);
+
+  count_leading_zeros (shift, nh);
+  shift &= ~1;
+
+  /* Make x > sqrt(n) */
+  x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
+  x <<= (W_TYPE_SIZE - shift) / 2;

-/* The maximum number of factors, including -1, for negative numbers.  */
-#define MAX_N_FACTORS (sizeof (uintmax_t) * CHAR_BIT)
+  /* Do we need more than one iteration? */
+  for (;;)
+    {
+      uintmax_t r ATTRIBUTE_UNUSED;
+      uintmax_t q, y;
+      udiv_qrnnd (q, r, nh, nl, x);
+      y = (x + q) / 2;
+
+      if (y >= x)
+        {
+          uintmax_t hi, lo;
+          umul_ppmm (hi, lo, x + 1, x + 1);
+          assert (gt2 (hi, lo, nh, nl));
+
+          umul_ppmm (hi, lo, x, x);
+          assert (ge2 (nh, nl, hi, lo));
+          sub_ddmmss (hi, lo, nh, nl, hi, lo);
+          assert (hi == 0);
+
+          return x;
+        }
+
+      x = y;
+    }
+}
+
+/* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
+#define MAGIC64 ((uint64_t) 0x0202021202030213ULL)
+#define MAGIC63 ((uint64_t) 0x0402483012450293ULL)
+#define MAGIC65 ((uint64_t) 0x218a019866014613ULL)
+#define MAGIC11 0x23b

-/* The trial divisor increment wheel.  Use it to skip over divisors that
-   are composites of 2, 3, 5, 7, or 11.  The part from WHEEL_START up to
-   WHEEL_END is reused periodically, while the "lead in" is used to test
-   for those primes and to jump onto the wheel.  For more information, see
-   http://www.utm.edu/research/primes/glossary/WheelFactorization.html  */
+/* Returns the square root if the input is a square, otherwise 0. */
+static uintmax_t _GL_ATTRIBUTE_CONST
+is_square (uintmax_t x)
+{
+  /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
+     computing the square root. */
+  if (((MAGIC64 >> (x & 63)) & 1)
+      && ((MAGIC63 >> (x % 63)) & 1)
+      /* Both 0 and 64 are squares mod (65) */
+      && ((MAGIC65 >> ((x % 65) & 63)) & 1)
+      && ((MAGIC11 >> (x % 11) & 1)))
+    {
+      uintmax_t r = isqrt (x);
+      if (r*r == x)
+        return r;
+    }
+  return 0;
+}

-#include "wheel-size.h"  /* For the definition of WHEEL_SIZE.  */
-static const unsigned char wheel_tab[] =
+/* invtab[i] = floor(0x10000 / (0x100 + i) */
+static const unsigned short invtab[0x81] =
   {
-#include "wheel.h"
+    0x200,
+    0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
+    0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
+    0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
+    0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
+    0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
+    0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
+    0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
+    0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
+    0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
+    0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
+    0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
+    0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
+    0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
+    0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
+    0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
+    0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
   };

-#define WHEEL_START (wheel_tab + WHEEL_SIZE)
-#define WHEEL_END (wheel_tab + ARRAY_CARDINALITY (wheel_tab))
+/* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
+   that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
+#define div_smallq(q, r, u, d)                                          \
+  do {                                                                  \
+    if ((u) / 0x40 < (d))                                               \
+      {                                                                 \
+        int _cnt;                                                       \
+        uintmax_t _dinv, _mask, _q, _r;                                 \
+        count_leading_zeros (_cnt, (d));                                \
+        _r = (u);                                                       \
+        if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
+          {                                                             \
+            _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
+            _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
+          }                                                             \
+        else                                                            \
+          {                                                             \
+            _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
+            _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
+          }                                                             \
+        _r -= _q*(d);                                                   \
+                                                                        \
+        _mask = -(uintmax_t) (_r >= (d));                               \
+        (r) = _r - (_mask & (d));                                       \
+        (q) = _q - _mask;                                               \
+        assert ( (q) * (d) + (r) == u);                                 \
+      }                                                                 \
+    else                                                                \
+      {                                                                 \
+        uintmax_t _q = (u) / (d);                                       \
+        (r) = (u) - _q * (d);                                           \
+        (q) = _q;                                                       \
+      }                                                                 \
+  } while (0)
+
+/* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
+   = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
+   with 3025 = 55^2.
+
+   Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
+   representing G_0 = (-55, 2*4652, 8653).
+
+   In the notation of the paper:
+
+   S_{-1} = 55, S_0 = 8653, R_0 = 4652
+
+   Put
+
+     t_0 = floor([q_0 + R_0] / S0) = 1
+     R_1 = t_0 * S_0 - R_0 = 4001
+     S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
+*/
+
+/* Multipliers, in order of efficiency:
+   0.7268  3*5*7*11 = 1155 = 3 (mod 4)
+   0.7317  3*5*7    =  105 = 1
+   0.7820  3*5*11   =  165 = 1
+   0.7872  3*5      =   15 = 3
+   0.8101  3*7*11   =  231 = 3
+   0.8155  3*7      =   21 = 1
+   0.8284  5*7*11   =  385 = 1
+   0.8339  5*7      =   35 = 3
+   0.8716  3*11     =   33 = 1
+   0.8774  3        =    3 = 3
+   0.8913  5*11     =   55 = 3
+   0.8972  5        =    5 = 1
+   0.9233  7*11     =   77 = 1
+   0.9295  7        =    7 = 3
+   0.9934  11       =   11 = 3
+*/
+#define QUEUE_SIZE 50

-/* FIXME: comment */
+#if STAT_SQUFOF
+# define Q_FREQ_SIZE 50
+/* Element 0 keeps the total */
+static unsigned int q_freq[Q_FREQ_SIZE + 1];
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif

-static size_t
-factor_wheel (uintmax_t n0, size_t max_n_factors, uintmax_t *factors)
+/* Return true on success.  Expected to fail only for numbers
+   >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
+static bool
+factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
 {
-  uintmax_t n = n0, d, q;
-  size_t n_factors = 0;
-  unsigned char const *w = wheel_tab;
+  /* Uses algorithm and notation from

-  if (n <= 1)
-    return n_factors;
+     SQUARE FORM FACTORIZATION
+     JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
+
+     http://homes.cerias.purdue.edu/~ssw/squfof.pdf
+   */
+
+  static const unsigned int multipliers_1[] =
+    { /* = 1 (mod 4) */
+      105, 165, 21, 385, 33, 5, 77, 1, 0
+    };
+  static const unsigned int multipliers_3[] =
+    { /* = 3 (mod 4) */
+      1155, 15, 231, 35, 3, 55, 7, 11, 0
+    };
+
+  const unsigned int *m;
+
+  struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];

-  /* The exit condition in the following loop is correct because
-     any time it is tested one of these 3 conditions holds:
-      (1) d divides n
-      (2) n is prime
-      (3) n is composite but has no factors less than d.
-     If (1) or (2) obviously the right thing happens.
-     If (3), then since n is composite it is >= d^2. */
+  if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
+    return false;

-  d = 2;
-  do
+  uintmax_t sqrt_n = isqrt2 (n1, n0);
+
+  if (n0 == sqrt_n * sqrt_n)
     {
-      q = n / d;
-      while (n == q * d)
+      uintmax_t p1, p0;
+
+      umul_ppmm (p1, p0, sqrt_n, sqrt_n);
+      assert (p0 == n0);
+
+      if (n1 == p1)
         {
-          assert (n_factors < max_n_factors);
-          factors[n_factors++] = d;
-          n = q;
-          q = n / d;
+          if (prime_p (sqrt_n))
+            factor_insert_multiplicity (factors, sqrt_n, 2);
+          else
+            {
+              struct factors f;
+
+              f.nfactors = 0;
+              if (!factor_using_squfof (0, sqrt_n, &f))
+                {
+                  /* Try pollard rho instead */
+                  factor_using_pollard_rho (sqrt_n, 1, &f);
+                }
+              /* Duplicate the new factors */
+              for (unsigned int i = 0; i < f.nfactors; i++)
+                factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
+            }
+          return true;
         }
-      d += *(w++);
-      if (w == WHEEL_END)
-        w = WHEEL_START;
     }
-  while (d <= q);

-  if (n != 1 || n0 == 1)
+  /* Select multipliers so we always get n * mu = 3 (mod 4) */
+  for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
+       *m; m++)
     {
-      assert (n_factors < max_n_factors);
-      factors[n_factors++] = n;
-    }
+      uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
+      unsigned int i;
+      unsigned int mu = *m;
+      unsigned int qpos = 0;
+
+      assert (mu * n0 % 4 == 3);
+
+      /* In the notation of the paper, with mu * n == 3 (mod 4), we
+         get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
+         I understand it, the necessary bound is 4 \mu^3 < n, or 32
+         mu^3 < n.
+
+         However, this seems insufficient: With n = 37243139 and mu =
+         105, we get a trivial factor, from the square 38809 = 197^2,
+         without any corresponding Q earlier in the iteration.
+
+         Requiring 64 mu^3 < n seems sufficient. */
+      if (n1 == 0)
+        {
+          if ((uintmax_t) mu*mu*mu >= n0 / 64)
+            continue;
+        }
+      else
+        {
+          if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
+            continue;
+        }
+      umul_ppmm (Dh, Dl, n0, mu);
+      Dh += n1 * mu;
+
+      assert (Dl % 4 != 1);
+      assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
+
+      S = isqrt2 (Dh, Dl);
+
+      Q1 = 1;
+      P = S;
+
+      /* Square root remainder fits in one word, so ignore high part. */
+      Q = Dl - P*P;
+      /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
+      L = isqrt (2*S);
+      B = 2*L;
+      L1 = mu * 2 * L;
+
+      /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
+         4 D. */
+
+      for (i = 0; i <= B; i++)
+        {
+          uintmax_t q, P1, t, rem;
+
+          div_smallq (q, rem, S+P, Q);
+          P1 = S - rem; /* P1 = q*Q - P */
+
+#if STAT_SQUFOF
+          assert (q > 0);
+          q_freq[0]++;
+          q_freq[MIN (q, Q_FREQ_SIZE)]++;
+#endif
+
+          if (Q <= L1)
+            {
+              uintmax_t g = Q;
+
+              if ( (Q & 1) == 0)
+                g /= 2;
+
+              g /= gcd_odd (g, mu);

-  return n_factors;
+              if (g <= L)
+                {
+                  if (qpos >= QUEUE_SIZE)
+                    error (EXIT_FAILURE, 0, _("squfof queue overflow"));
+                  queue[qpos].Q = g;
+                  queue[qpos].P = P % g;
+                  qpos++;
+                }
+            }
+
+          /* I think the difference can be either sign, but mod
+             2^W_TYPE_SIZE arithmetic should be fine. */
+          t = Q1 + q * (P - P1);
+          Q1 = Q;
+          Q = t;
+          P = P1;
+
+          if ( (i & 1) == 0)
+            {
+              uintmax_t r = is_square (Q);
+              if (r)
+                {
+                  for (unsigned int j = 0; j < qpos; j++)
+                    {
+                      if (queue[j].Q == r)
+                        {
+                          if (r == 1)
+                            /* Traversed entire cycle. */
+                            goto next_multiplier;
+
+                          /* Need the absolute value for divisibility test. */
+                          if (P >= queue[j].P)
+                            t = P - queue[j].P;
+                          else
+                            t = queue[j].P - P;
+                          if (t % r == 0)
+                            {
+                              /* Delete entries up to and including entry
+                                 j, which matched. */
+                              memmove (queue, queue + j + 1,
+                                       (qpos - j - 1) * sizeof (queue[0]));
+                              qpos -= (j + 1);
+                            }
+                          goto next_i;
+                        }
+                    }
+
+                  /* We have found a square form, which should give a
+                     factor. */
+                  Q1 = r;
+                  assert (S >= P); /* What signs are possible? */
+                  P += r * ((S - P) / r);
+
+                  /* Note: Paper says (N - P*P) / Q1, that seems incorrect
+                     for the case D = 2N. */
+                  /* Compute Q = (D - P*P) / Q1, but we need double
+                     precision. */
+                  uintmax_t hi, lo;
+                  umul_ppmm (hi, lo, P, P);
+                  sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
+                  udiv_qrnnd (Q, rem, hi, lo, Q1);
+                  assert (rem == 0);
+
+                  for (;;)
+                    {
+                      /* Note: There appears to by a typo in the paper,
+                         Step 4a in the algorithm description says q <--
+                         floor([S+P]/\hat Q), but looking at the equations
+                         in Sec. 3.1, it should be q <-- floor([S+P] / Q).
+                         (In this code, \hat Q is Q1). */
+                      div_smallq (q, rem, S+P, Q);
+                      P1 = S - rem;     /* P1 = q*Q - P */
+
+#if STAT_SQUFOF
+                      q_freq[0]++;
+                      q_freq[MIN (q, Q_FREQ_SIZE)]++;
+#endif
+                      if (P == P1)
+                        break;
+                      t = Q1 + q * (P - P1);
+                      Q1 = Q;
+                      Q = t;
+                      P = P1;
+                    }
+
+                  if ( (Q & 1) == 0)
+                    Q /= 2;
+                  Q /= gcd_odd (Q, mu);
+
+                  assert (Q > 1 && (n1 || Q < n0));
+
+                  if (prime_p (Q))
+                    factor_insert (factors, Q);
+                  else if (!factor_using_squfof (0, Q, factors))
+                    factor_using_pollard_rho (Q, 2, factors);
+
+                  divexact_21 (n1, n0, n1, n0, Q);
+
+                  if (prime2_p (n1, n0))
+                    factor_insert_large (factors, n1, n0);
+                  else
+                    {
+                      if (!factor_using_squfof (n1, n0, factors))
+                        {
+                          if (n1 == 0)
+                            factor_using_pollard_rho (n0, 1, factors);
+                          else
+                            factor_using_pollard_rho2 (n1, n0, 1, factors);
+                        }
+                    }
+
+                  return true;
+                }
+            }
+        next_i:;
+        }
+    next_multiplier:;
+    }
+  return false;
 }

-/* Single-precision factoring */
 static void
-print_factors_single (uintmax_t n)
+factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
 {
-  uintmax_t factors[MAX_N_FACTORS];
-  size_t n_factors = factor_wheel (n, MAX_N_FACTORS, factors);
-  size_t i;
-  char buf[INT_BUFSIZE_BOUND (uintmax_t)];
+  factors->nfactors = 0;
+  factors->plarge[1] = 0;

-  printf ("%s:", umaxtostr (n, buf));
-  for (i = 0; i < n_factors; i++)
-    printf (" %s", umaxtostr (factors[i], buf));
-  putchar ('\n');
-}
+  if (t1 == 0 && t0 < 2)
+    return;

-#if HAVE_GMP
-static int
-mpcompare (const void *av, const void *bv)
-{
-  mpz_t *const *a = av;
-  mpz_t *const *b = bv;
-  return mpz_cmp (**a, **b);
+  t0 = factor_using_division (&t1, t1, t0, factors);
+
+  if (t1 == 0 && t0 < 2)
+    return;
+
+  if (prime2_p (t1, t0))
+    factor_insert_large (factors, t1, t0);
+  else
+    {
+      if (alg == ALG_SQUFOF)
+        if (factor_using_squfof (t1, t0, factors))
+          return;
+
+      if (t1 == 0)
+        factor_using_pollard_rho (t0, 1, factors);
+      else
+        factor_using_pollard_rho2 (t1, t0, 1, factors);
+    }
 }

+#if HAVE_GMP
 static void
-sort_and_print_factors (void)
+mp_factor (mpz_t t, struct mp_factors *factors)
 {
-  mpz_t **faclist;
-  size_t i;
+  mp_factor_init (factors);

-  faclist = xcalloc (nfactors_found, sizeof *faclist);
-  for (i = 0; i < nfactors_found; ++i)
+  if (mpz_sgn (t) != 0)
     {
-      faclist[i] = &factor[i];
+      mp_factor_using_division (t, factors);
+
+      if (mpz_cmp_ui (t, 1) != 0)
+        {
+          debug ("[is number prime?] ");
+          if (mp_prime_p (t))
+            mp_factor_insert (factors, t);
+          else
+            mp_factor_using_pollard_rho (t, 1, factors);
+        }
     }
-  qsort (faclist, nfactors_found, sizeof *faclist, mpcompare);
+}
+#endif
+
+static strtol_error
+strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
+{
+  unsigned int lo_carry;
+  uintmax_t hi, lo;

-  for (i = 0; i < nfactors_found; ++i)
+  strtol_error err;
+
+  hi = lo = 0;
+  for (;;)
     {
-      fputc (' ', stdout);
-      mpz_out_str (stdout, 10, *faclist[i]);
+      unsigned int c = *s++;
+      if (c == 0)
+        break;
+
+      if (UNLIKELY (!ISDIGIT (c)))
+        {
+          err = LONGINT_INVALID;
+          break;
+        }
+      c -= '0';
+
+      err = LONGINT_OK;           /* we've seen at least one valid digit */
+
+      if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
+        {
+          err = LONGINT_OVERFLOW;
+          break;
+        }
+      hi = 10 * hi;
+
+      lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
+      lo_carry += 10 * lo < 2 * lo;
+
+      lo = 10 * lo;
+      lo += c;
+
+      lo_carry += lo < c;
+      hi += lo_carry;
+      if (UNLIKELY (hi < lo_carry))
+        {
+          err = LONGINT_OVERFLOW;
+          break;
+        }
     }
-  putchar ('\n');
-  free (faclist);
+
+  *hip = hi;
+  *lop = lo;
+
+  return err;
 }

 static void
-free_factors (void)
+print_uintmaxes (uintmax_t t1, uintmax_t t0)
 {
-  size_t i;
+  uintmax_t q, r;

-  for (i = 0; i < nfactors_found; ++i)
+  if (t1 == 0)
+    printf ("%ju", t0);
+  else
     {
-      mpz_clear (factor[i]);
+      /* Use very plain code here since it seems hard to write fast code
+         without assuming a specific word size.  */
+      q = t1 / 1000000000;
+      r = t1 % 1000000000;
+      udiv_qrnnd (t0, r, r, t0, 1000000000);
+      print_uintmaxes (q, t0);
+      printf ("%09u", (int) r);
     }
-  /* Don't actually free factor[] because in the case where we are
-     reading numbers from stdin, we may be about to use it again.  */
-  nfactors_found = 0;
 }

-/* Arbitrary-precision factoring */
+/* Single-precision factoring */
 static void
-print_factors_multi (mpz_t t)
+print_factors_single (uintmax_t t1, uintmax_t t0)
 {
-  mpz_out_str (stdout, 10, t);
+  struct factors factors;
+
+  print_uintmaxes (t1, t0);
   putchar (':');

-  if (mpz_sgn (t) != 0)
-    {
-      /* Set the trial division limit according to the size of t.  */
-      size_t n_bits = mpz_sizeinbase (t, 2);
-      unsigned int division_limit = MIN (n_bits, 1000);
-      division_limit *= division_limit;
+  factor (t1, t0, &factors);

-      factor_using_division (t, division_limit);
+  for (unsigned int j = 0; j < factors.nfactors; j++)
+    for (unsigned int k = 0; k < factors.e[j]; k++)
+      printf (" %ju", factors.p[j]);

-      if (mpz_cmp_ui (t, 1) != 0)
-        {
-          debug ("[is number prime?] ");
-          if (mpz_probab_prime_p (t, MR_REPS))
-            emit_factor (t);
-          else
-            factor_using_pollard_rho (t, 1);
-        }
+  if (factors.plarge[1])
+    {
+      putchar (' ');
+      print_uintmaxes (factors.plarge[1], factors.plarge[0]);
     }
-
-  mpz_clear (t);
-  sort_and_print_factors ();
-  free_factors ();
+  putchar ('\n');
 }
-#endif
-

 /* Emit the factors of the indicated number.  If we have the option of using
    either algorithm, we select on the basis of the length of the number.
@@ -434,58 +2288,55 @@ print_factors_multi (mpz_t t)
    has enough digits, because the algorithm is better.  The turnover point
    depends on the value.  */
 static bool
-print_factors (char const *s)
+print_factors (const char *input)
 {
-  uintmax_t n;
-  strtol_error err = xstrtoumax (s, NULL, 10, &n, "");
+  uintmax_t t1, t0;

-#if HAVE_GMP
-  enum { GMP_TURNOVER_POINT = 100000 };
+  /* Try converting the number to one or two words.  If it fails, use GMP or
+     print an error message.  The 2nd condition checks that the most
+     significant bit of the two-word number is clear, in a typesize neutral
+     way.  */
+  strtol_error err = strto2uintmax (&t1, &t0, input);

-  if (err == LONGINT_OVERFLOW
-      || (err == LONGINT_OK && GMP_TURNOVER_POINT <= n))
+  switch (err)
     {
-      mpz_t t;
-      mpz_init (t);
-      if (gmp_sscanf (s, "%Zd", t) == 1)
+    case LONGINT_OK:
+      if (((t1 << 1) >> 1) == t1)
         {
-          debug ("[%s]", _("using arbitrary-precision arithmetic"));
-          print_factors_multi (t);
+          debug ("[%s]", _("using single-precision arithmetic"));
+          print_factors_single (t1, t0);
           return true;
         }
-      err = LONGINT_INVALID;
-    }
-#endif
-
-  switch (err)
-    {
-    case LONGINT_OK:
-      debug ("[%s]", _("using single-precision arithmetic"));
-      print_factors_single (n);
-      return true;
-
-    case LONGINT_OVERFLOW:
-      error (0, 0, _("%s is too large"), quote (s));
-      return false;
+      break;

     default:
-      error (0, 0, _("%s is not a valid positive integer"), quote (s));
+      error (0, 0, _("%s is not a valid positive integer"), quote (input));
       return false;
     }
-}

-enum
-{
-  VERBOSE_OPTION = CHAR_MAX + 1
-};
+#if HAVE_GMP
+  debug ("[%s]", _("using arbitrary-precision arithmetic"));
+  mpz_t t;
+  struct mp_factors factors;

-static struct option const long_options[] =
-{
-  {"verbose", no_argument, NULL, VERBOSE_OPTION},
-  {GETOPT_HELP_OPTION_DECL},
-  {GETOPT_VERSION_OPTION_DECL},
-  {NULL, 0, NULL, 0}
-};
+  mpz_init_set_str (t, input, 10);
+
+  gmp_printf ("%Zd:", t);
+  mp_factor (t, &factors);
+
+  for (unsigned int j = 0; j < factors.nfactors; j++)
+    for (unsigned int k = 0; k < factors.e[j]; k++)
+      gmp_printf (" %Zd", factors.p[j]);
+
+  mp_factor_clear (&factors);
+  mpz_clear (t);
+  putchar ('\n');
+  return true;
+#else
+  error (0, 0, _("%s is too large"), quote (input));
+  return false;
+#endif
+}

 void
 usage (int status)
@@ -535,9 +2386,6 @@ do_stdin (void)
 int
 main (int argc, char **argv)
 {
-  bool ok;
-  int c;
-
   initialize_main (&argc, &argv);
   set_program_name (argv[0]);
   setlocale (LC_ALL, "");
@@ -546,12 +2394,23 @@ main (int argc, char **argv)

   atexit (close_stdout);

+  alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
+
+  int c;
   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
     {
       switch (c)
         {
         case VERBOSE_OPTION:
-          verbose = true;
+          verbose = 1;
+          break;
+
+        case 's':
+          alg = ALG_SQUFOF;
+          break;
+
+        case 'w':
+          flag_prove_primality = false;
           break;

         case_GETOPT_HELP_CHAR;
@@ -563,18 +2422,36 @@ main (int argc, char **argv)
         }
     }

+#if STAT_SQUFOF
+  if (alg == ALG_SQUFOF)
+    memset (q_freq, 0, sizeof (q_freq));
+#endif
+
+  bool ok;
   if (argc <= optind)
     ok = do_stdin ();
   else
     {
-      int i;
       ok = true;
-      for (i = optind; i < argc; i++)
+      for (int i = optind; i < argc; i++)
         if (! print_factors (argv[i]))
           ok = false;
     }
-#if HAVE_GMP
-  free (factor);
+
+#if STAT_SQUFOF
+  if (alg == ALG_SQUFOF && q_freq[0] > 0)
+    {
+      double acc_f;
+      printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
+      for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
+        {
+          double f = (double) q_freq[i] / q_freq[0];
+          acc_f += f;
+          printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
+                  100.0 * f, 100.0 * acc_f);
+        }
+    }
 #endif
+
   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }
diff --git a/src/local.mk b/src/local.mk
index 98259fa..6a01ef1 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -35,7 +35,10 @@ bin_PROGRAMS = @bin_PROGRAMS@
 pkglibexec_PROGRAMS = @pkglibexec_PROGRAMS@

 # Needed by the testsuite.
-noinst_PROGRAMS = src/setuidgid src/getlimits
+noinst_PROGRAMS =              \
+  src/getlimits                        \
+  src/make-prime-list          \
+  src/setuidgid

 noinst_HEADERS =               \
   src/chown-core.h             \
@@ -48,20 +51,18 @@ noinst_HEADERS =            \
   src/fs-is-local.h            \
   src/group-list.h             \
   src/ioblksize.h              \
+  src/longlong.h               \
   src/ls.h                     \
   src/operand2sig.h            \
   src/prog-fprintf.h           \
   src/remove.h                 \
   src/system.h                 \
-  src/wheel-size.h             \
-  src/wheel.h                  \
   src/uname.h

 EXTRA_DIST +=          \
   src/dcgen            \
   src/dircolors.hin    \
   src/tac-pipe.c       \
-  src/wheel-gen.pl     \
   src/extract-magic    \
   src/c99-to-c89.diff

@@ -134,6 +135,12 @@ src_link_LDADD = $(LDADD)
 src_ln_LDADD = $(LDADD)
 src_logname_LDADD = $(LDADD)
 src_ls_LDADD = $(LDADD)
+
+# This must *not* depend on anything in lib/, since it is used to generate
+# src/primes.h.  If it depended on libcoreutils.a, that would pull all lib/*.c
+# into BUILT_SOURCES.
+src_make_prime_list_LDADD =
+
 src_md5sum_LDADD = $(LDADD)
 src_mkdir_LDADD = $(LDADD)
 src_mkfifo_LDADD = $(LDADD)
@@ -371,19 +378,11 @@ src/dircolors.h: src/dcgen src/dircolors.hin
        $(AM_V_at)chmod a-w $@-t
        $(AM_V_at)mv $@-t $@

-wheel_size = 5
-
-BUILT_SOURCES += src/wheel-size.h
-src/wheel-size.h: Makefile.am
-       $(AM_V_GEN)rm -f $@ $@-t
-       $(AM_V_at)echo '#define WHEEL_SIZE $(wheel_size)' > $@-t
-       $(AM_V_at)chmod a-w $@-t
-       $(AM_V_at)mv $@-t $@
-
-BUILT_SOURCES += src/wheel.h
-src/wheel.h: src/wheel-gen.pl Makefile.am
+BUILT_SOURCES += src/primes.h
+CLEANFILES += src/primes.h
+src/primes.h: src/make-prime-list
        $(AM_V_GEN)rm -f $@ $@-t
-       $(AM_V_at)$(srcdir)/src/wheel-gen.pl $(wheel_size) > $@-t
+       $(AM_V_at)src/make-prime-list 5000 > $@-t
        $(AM_V_at)chmod a-w $@-t
        $(AM_V_at)mv $@-t $@

diff --git a/src/longlong.h b/src/longlong.h
index 84e32ee..510f40e 100644
--- a/src/longlong.h
+++ b/src/longlong.h
@@ -1,7 +1,7 @@
 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.

 Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003,
-2004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

 This file is free software; you can redistribute it and/or modify it under the
 terms of the GNU Lesser General Public License as published by the Free
@@ -2055,6 +2055,10 @@ extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, 
UWtype *);
 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
 #endif

+#ifndef __GMP_DECLSPEC
+#define __GMP_DECLSPEC /* empty */
+#endif
+
 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
 #if !defined (udiv_qrnnd)
 #define UDIV_NEEDS_NORMALIZATION 1
diff --git a/src/wheel-gen.pl b/src/wheel-gen.pl
deleted file mode 100755
index 65b60d1..0000000
--- a/src/wheel-gen.pl
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-# Generate the spokes of a wheel, for wheel factorization.
-
-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
-  if 0;
-
-use strict;
-(my $ME = $0) =~ s|.*/||;
-
-# A global destructor to close standard output with error checking.
-sub END
-{
-  defined fileno STDOUT
-    or return;
-  close STDOUT
-    and return;
-  warn "$ME: closing standard output: $!\n";
-  $? ||= 1;
-}
-
-sub is_prime ($)
-{
-  my ($n) = @_;
-  use integer;
-
-  $n == 2
-    and return 1;
-
-  my $d = 2;
-  my $w = 1;
-  while (1)
-    {
-      my $q = $n / $d;
-      $n == $q * $d
-        and return 0;
-      $d += $w;
-      $q < $d
-        and last;
-      $w = 2;
-    }
-  return 1;
-}
-
-{
-  @ARGV == 1
-    or die "$ME: missing argument\n";
-
-  my $wheel_size = $ARGV[0];
-
-  my @primes = (2);
-  my $product = $primes[0];
-  my $n_primes = 1;
-  for (my $i = 3; ; $i += 2)
-    {
-      if (is_prime $i)
-        {
-          push @primes, $i;
-          $product *= $i;
-          ++$n_primes == $wheel_size
-            and last;
-        }
-    }
-
-  my $ws_m1 = $wheel_size - 1;
-  print <<EOF;
-/* The first $ws_m1 elements correspond to the incremental offsets of the
-   first $wheel_size primes (@primes).  The $wheel_size(th) element is the
-   difference between that last prime and the next largest integer
-   that is not a multiple of those primes.  The remaining numbers
-   define the wheel.  For more information, see
-   http://www.utm.edu/research/primes/glossary/WheelFactorization.html.  */
-EOF
-
-  my @increments;
-  my $prev = 2;
-  for (my $i = 3; ; $i += 2)
-    {
-      my $rel_prime = 1;
-      foreach my $divisor (@primes)
-        {
-          $i != $divisor && $i % $divisor == 0
-            and $rel_prime = 0;
-        }
-
-      if ($rel_prime)
-        {
-          #warn $i, ' ', $i - $prev, "\n";
-          push @increments, $i - $prev;
-          $prev = $i;
-
-          $product + 1 < $i
-            and last;
-        }
-    }
-
-  print join (",\n", @increments), "\n";
-
-  exit 0;
-}
diff --git a/tests/factor/run.sh b/tests/factor/run.sh
new file mode 100755
index 0000000..6ff24c3
--- /dev/null
+++ b/tests/factor/run.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+# Test the factor rewrite.
+# Expect to be invoked via a file whose basename matches
+# /^(\d+)\-(\d+)\-([\da-f]{40})\.sh$/
+# The test is to run this command
+# seq $1 $2 | factor | shasum -c --status <(echo $3  -)
+# I.e., to ensure that the factorizations of integers $1..$2
+# match what we expect.
+
+# Copyright (C) 2012 Free Software Foundation, Inc.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+
+# Don't run these tests by default.
+very_expensive_
+
+print_ver_ factor seq
+
+# Remove the ".sh" suffix:
+t=${ME_%.sh}
+
+# Make IFS include "-", so that a simple "set" will separate the args:
+IFS=-$IFS
+set $t
+echo "$3  -" > exp
+
+f=1
+seq $1 $2 | factor | shasum -c --status exp && f=0
+
+Exit $f
diff --git a/tests/local.mk b/tests/local.mk
index 0b6d576..486bf31 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -16,9 +16,9 @@
 ## along with this program.  If not, see <http://www.gnu.org/licenses/>.

 # Indirections required so that we'll still be able to know the
-# complete list of our tests even if the user override TESTS from the
-# command line (which he's allowed to do by the test harness API).
-TESTS = $(all_tests)
+# complete list of our tests even if the user overrides TESTS
+# from the command line (as permitted by the test harness API).
+TESTS = $(all_tests) $(factor_tests)
 root_tests = $(all_root_tests)

 EXTRA_DIST += $(all_tests)
@@ -89,14 +89,15 @@ TESTS_ENVIRONMENT =                         \
 VERBOSE = yes

 EXTRA_DIST +=                  \
+  init.cfg                     \
   tests/Coreutils.pm           \
   tests/CuSkip.pm              \
   tests/CuTmpdir.pm            \
   tests/d_type-check           \
   tests/envvar-check           \
+  tests/factor/run.sh          \
   tests/filefrag-extent-compare \
   tests/fiemap-capable         \
-  init.cfg                     \
   tests/init.sh                        \
   tests/lang-default           \
   tests/no-perl                        \
@@ -624,6 +625,68 @@ all_tests =                                        \
   tests/touch/trailing-slash.sh                        \
   $(all_root_tests)

+# prefix of 2^64
+p = 184467440737
+# prefix of 2^96
+q = 79228162514264337593543
+
+# Each of these numbers has a Pollard rho factor larger than 2^64,
+# and thus exercises some hard-to-reach code in factor.c.
+t1 = 170141183460469225450570946617781744489
+t2 = 170141183460469229545748130981302223887
+# Factors of the above:
+# t1: 9223372036854775421 18446744073709551709
+# t2: 9223372036854775643 18446744073709551709
+
+# Each tests is a triple: lo, hi, sha1 of result.
+# The test script, run.sh, runs seq lo hi|factor|sha1sum
+# and verifies that the actual and expected checksums are the same.
+tf = tests/factor
+factor_tests = \
+  $(tf)/0-10000000-a451244522b1b662c86cb3cbb55aee3e085a61a0.sh \
+  $(tf)/10000000-20000000-c792a2e02f1c8536b5121f624b04039d20187016.sh \
+  $(tf)/20000000-30000000-8115e8dff97d1674134ec054598d939a2a5f6113.sh \
+  $(tf)/30000000-40000000-fe7b832c8e0ed55035152c0f9ebd59de73224a60.sh \
+  $(tf)/40000000-50000000-b8786d66c432e48bc5b342ee3c6752b7f096f206.sh \
+  $(tf)/50000000-60000000-a74fe518c5f79873c2b9016745b88b42c8fd3ede.sh \
+  $(tf)/60000000-70000000-689bc70d681791e5d1b8ac1316a05d0c4473d6db.sh \
+  $(tf)/70000000-80000000-d370808f2ab8c865f64c2ff909c5722db5b7d58d.sh \
+  $(tf)/80000000-90000000-7978aa66bf2bdb446398336ea6f02605e9a77581.sh \
+  $(tf)/$(t1)-$(t1)-4622287c5f040cdb7b3bbe4d19d29a71ab277827.sh \
+  $(tf)/$(t2)-$(t2)-dea308253708b57afad357e8c0d2a111460ef50e.sh \
+  $(tf)/$(p)08551616-$(p)08651615-66c57cd58f4fb572df7f088d17e4f4c1d4f01bb1.sh \
+  $(tf)/$(p)08651616-$(p)08751615-729228e693b1a568ecc85b199927424c7d16d410.sh \
+  $(tf)/$(p)08751616-$(p)08851615-5a0c985017c2d285e4698f836f5a059e0b684563.sh \
+  $(tf)/$(p)08851616-$(p)08951615-0482295c514e371c98ce9fd335deed0c9c44a4f4.sh \
+  $(tf)/$(p)08951616-$(p)09051615-9c0e1105ac7c45e27e7bbeb5e213f530d2ad1a71.sh \
+  $(tf)/$(p)09051616-$(p)09151615-604366d2b1d75371d0679e6a68962d66336cd383.sh \
+  $(tf)/$(p)09151616-$(p)09251615-9192d2bdee930135b28d7160e6d395a7027871da.sh \
+  $(tf)/$(p)09251616-$(p)09351615-bcf56ae55d20d700690cff4d3327b78f83fc01bf.sh \
+  $(tf)/$(p)09351616-$(p)09451615-16b106398749e5f24d278ba7c58229ae43f650ac.sh \
+  $(tf)/$(p)09451616-$(p)09551615-ad2c6ed63525f8e7c83c4c416e7715fa1bebc54c.sh \
+  $(tf)/$(p)09551616-$(p)09651615-2b6f9c11742d9de045515a6627c27a042c49f8ba.sh \
+  $(tf)/$(p)09651616-$(p)09751615-54851acd51c4819beb666e26bc0100dc9adbc310.sh \
+  $(tf)/$(p)09751616-$(p)09851615-6939c2a7afd2d81f45f818a159b7c5226f83a50b.sh \
+  $(tf)/$(p)09851616-$(p)09951615-0f2c8bc011d2a45e2afa01459391e68873363c6c.sh \
+  $(tf)/$(p)09951616-$(p)10051615-630dc2ad72f4c222bad1405e6c5bea590f92a98c.sh \
+  $(tf)/$(q)940336-$(q)942335-63cbd6313d78247b04d63bbbac50cb8f8d33ff71.sh \
+  $(tf)/$(q)942336-$(q)944335-0d03d63653767173182491b86fa18f8f680bb036.sh \
+  $(tf)/$(q)944336-$(q)946335-ca43bd38cd9f97cc5bb63613cb19643578640f0b.sh \
+  $(tf)/$(q)946336-$(q)948335-86d59545a0c13567fa96811821ea5cde950611b1.sh \
+  $(tf)/$(q)948336-$(q)950335-c3740e702fa9c97e6cf00150860e0b936a141a6b.sh \
+  $(tf)/$(q)950336-$(q)952335-551c3c4c4640d86fda311b5c3006dac45505c0ce.sh \
+  $(tf)/$(q)952336-$(q)954335-b1b0b00463c2f853d70ef9c4f7a96de5cb614156.sh \
+  $(tf)/$(q)954336-$(q)956335-8938a484a9ef6bb16478091d294fcde9f8ecea69.sh \
+  $(tf)/$(q)956336-$(q)958335-d1ae6bc712d994f35edf55c785d71ddf31f16535.sh \
+  $(tf)/$(q)958336-$(q)960335-2374919a89196e1fce93adfe779cb4664556d4b6.sh \
+  $(tf)/$(q)960336-$(q)962335-569e4363e8d9e8830a187d9ab27365eef08abde1.sh
+
+$(factor_tests): tests/factor/run.sh
+       $(AM_V_GEN)$(MKDIR_P) $(tf)
+       $(AM_V_at)ln -f $(srcdir)/tests/factor/run.sh $@
+
+CLEANFILES += $(factor_tests)
+
 pr_data =                                      \
   tests/pr/0F                                  \
   tests/pr/0FF                                 \
-- 
1.8.0.rc0.18.gf84667d


>From 06b7ed8051d07677572e074d3d45007f1282ed56 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Tue, 18 Sep 2012 08:26:17 +0200
Subject: [PATCH 8/8] factor: 25% speed-up, on output

* src/factor.c (print_factors_single): Use fputs and umaxtostr
rather than printf with "%ju".  This reduced the time required
to compute and print the factors of the first 10^7 integers from
over 8 seconds to 5.75s.  Run this command:
seq $((10**7)) | env time factor > /dev/null
---
 src/factor.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/factor.c b/src/factor.c
index 84d9721..1ca5812 100644
--- a/src/factor.c
+++ b/src/factor.c
@@ -2272,7 +2272,11 @@ print_factors_single (uintmax_t t1, uintmax_t t0)

   for (unsigned int j = 0; j < factors.nfactors; j++)
     for (unsigned int k = 0; k < factors.e[j]; k++)
-      printf (" %ju", factors.p[j]);
+      {
+        char buf[INT_BUFSIZE_BOUND (uintmax_t)];
+        putchar (' ');
+        fputs (umaxtostr (factors.p[j], buf), stdout);
+      }

   if (factors.plarge[1])
     {
-- 
1.8.0.rc0.18.gf84667d


reply via email to

[Prev in Thread] Current Thread [Next in Thread]