From c08eb6f586c3102235abbcb85fe8980e0fefe2f8 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Thu, 12 Aug 2021 10:56:38 +0200 Subject: [PATCH 2/2] rng: Add RANLUX++ generator This generator is an LCG equivalent of RANLUX using 576 bit numbers. Compared to the implementations already available, it provides a much higher luxury level and better initialization routine that guarantees non-overlapping sequences. For more information see, * A. Sibidanov, "A revision of the subtract-with-borrow random number generators", Computer Physics Communications, 221(2017) 299--303. * J. Hahnfeld, L. Moneta, "A Portable Implementation of RANLUX++", vCHEP2021, preprint https://arxiv.org/pdf/2106.02504.pdf --- AUTHORS | 1 + doc/rng.rst | 17 +++ doc_texinfo/rng.texi | 19 +++ rng/Makefile.am | 2 +- rng/gsl_rng.h | 1 + rng/ranluxpp.c | 154 +++++++++++++++++++++ rng/ranluxpp/helpers.h | 175 ++++++++++++++++++++++++ rng/ranluxpp/mulmod.h | 273 ++++++++++++++++++++++++++++++++++++++ rng/ranluxpp/ranlux_lcg.h | 110 +++++++++++++++ rng/test.c | 6 + rng/types.c | 1 + 11 files changed, 758 insertions(+), 1 deletion(-) create mode 100644 rng/ranluxpp.c create mode 100644 rng/ranluxpp/helpers.h create mode 100644 rng/ranluxpp/mulmod.h create mode 100644 rng/ranluxpp/ranlux_lcg.h diff --git a/AUTHORS b/AUTHORS index 331875c5..9457110b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -30,3 +30,4 @@ Patrick Alken - nonsymmetric and generalized Rhys Ulerich (rhys.ulerich@gmail.com) - multisets Pavel Holoborodko - fixed order Gauss-Legendre quadrature Pedro Gonnet - CQUAD integration routines. +Jonas Hahnfeld - implementation of RANLUX++ diff --git a/doc/rng.rst b/doc/rng.rst index 8e595d5e..9338458a 100644 --- a/doc/rng.rst +++ b/doc/rng.rst @@ -471,6 +471,23 @@ randomness. pseudo-random number generator of Luscher", Computer Physics Communications, 79 (1994) 111--114 +.. index:: RANLUX++ random number generator + +.. 
var:: gsl_rng_type * gsl_rng_ranluxpp + + The :code:`ranluxpp` generator is an LCG equivalent of RANLUX using + 576 bit numbers. Compared to the implementations above, it provides + a much higher luxury level and better initialization routine that + guarantees non-overlapping sequences. + + For more information see, + + * A. Sibidanov, "A revision of the subtract-with-borrow random number + generators", Computer Physics Communications, 221(2017) 299--303. + + * J. Hahnfeld, L. Moneta, "A Portable Implementation of RANLUX++", + vCHEP2021, preprint https://arxiv.org/pdf/2106.02504.pdf + .. index:: single: CMRG, combined multiple recursive random number generator diff --git a/doc_texinfo/rng.texi b/doc_texinfo/rng.texi index d2168c09..c91a0008 100644 --- a/doc_texinfo/rng.texi +++ b/doc_texinfo/rng.texi @@ -505,6 +505,25 @@ Communications}, 79 (1994) 111--114 @end deffn +@deffn {Generator} gsl_rng_ranluxpp +@cindex RANLUX++ random number generator +The @code{ranluxpp} generator is an LCG equivalent of RANLUX using +576 bit numbers. Compared to the implementations above, it provides +a much higher luxury level and better initialization routine that +guarantees non-overlapping sequences. + +For more information see, +@itemize @w{} +@item +A. Sibidanov, ``A revision of the subtract-with-borrow random number +generators'', @cite{Computer Physics Communications}, 221(2017), 299--303. +@item +J. Hahnfeld, L. Moneta, ``A Portable Implementation of RANLUX++'', +vCHEP2021, preprint @uref{https://arxiv.org/pdf/2106.02504.pdf}. +@end itemize +@end deffn + + @deffn {Generator} gsl_rng_cmrg @cindex CMRG, combined multiple recursive random number generator This is a combined multiple recursive generator by L'Ecuyer. 
diff --git a/rng/Makefile.am b/rng/Makefile.am index b0c6b8cc..6411d719 100644 --- a/rng/Makefile.am +++ b/rng/Makefile.am @@ -4,7 +4,7 @@ pkginclude_HEADERS = gsl_rng.h AM_CPPFLAGS = -I$(top_srcdir) -libgslrng_la_SOURCES = borosh13.c cmrg.c coveyou.c default.c file.c fishman18.c fishman20.c fishman2x.c gfsr4.c knuthran2.c knuthran.c knuthran2002.c lecuyer21.c minstd.c mrg.c mt.c r250.c ran0.c ran1.c ran2.c ran3.c rand48.c rand.c random.c randu.c ranf.c ranlux.c ranlxd.c ranlxs.c ranmar.c rng.c slatec.c taus.c taus113.c transputer.c tt.c types.c uni32.c uni.c vax.c waterman14.c zuf.c inline.c +libgslrng_la_SOURCES = borosh13.c cmrg.c coveyou.c default.c file.c fishman18.c fishman20.c fishman2x.c gfsr4.c knuthran2.c knuthran.c knuthran2002.c lecuyer21.c minstd.c mrg.c mt.c r250.c ran0.c ran1.c ran2.c ran3.c rand48.c rand.c random.c randu.c ranf.c ranlux.c ranluxpp.c ranlxd.c ranlxs.c ranmar.c rng.c slatec.c taus.c taus113.c transputer.c tt.c types.c uni32.c uni.c vax.c waterman14.c zuf.c inline.c CLEANFILES = test.dat diff --git a/rng/gsl_rng.h b/rng/gsl_rng.h index 4ec55911..f4ee5997 100644 --- a/rng/gsl_rng.h +++ b/rng/gsl_rng.h @@ -109,6 +109,7 @@ GSL_VAR const gsl_rng_type *gsl_rng_ranlxd2; GSL_VAR const gsl_rng_type *gsl_rng_ranlxs0; GSL_VAR const gsl_rng_type *gsl_rng_ranlxs1; GSL_VAR const gsl_rng_type *gsl_rng_ranlxs2; +GSL_VAR const gsl_rng_type *gsl_rng_ranluxpp; GSL_VAR const gsl_rng_type *gsl_rng_ranmar; GSL_VAR const gsl_rng_type *gsl_rng_slatec; GSL_VAR const gsl_rng_type *gsl_rng_taus; diff --git a/rng/ranluxpp.c b/rng/ranluxpp.c new file mode 100644 index 00000000..797d90f1 --- /dev/null +++ b/rng/ranluxpp.c @@ -0,0 +1,154 @@ +/* rng/ranluxpp.c + * + * Copyright (C) 2020, 2021 Jonas Hahnfeld + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or (at + * your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "ranluxpp/mulmod.h"
+#include "ranluxpp/ranlux_lcg.h"
+
+#include <gsl/gsl_rng.h>
+
+#include <assert.h>
+#include <stdint.h>
+
+/* RANLUX++ is an LCG equivalent of RANLUX using 576 bit numbers.
+
+   The idea of the generator (such as the initialization method) and the algorithm
+   for the modulo operation are described in
+   A. Sibidanov, *A revision of the subtract-with-borrow random number generators*,
+   *Computer Physics Communications*, 221(2017), 299-303,
+   preprint https://arxiv.org/pdf/1705.03123.pdf
+
+   The code is loosely based on the Assembly implementation by A. Sibidanov
+   available at https://github.com/sibidanov/ranluxpp/.
+
+   Compared to the original generator, this implementation contains a fix to ensure
+   that the modulo operation of the LCG always returns the smallest value congruent
+   to the modulus (based on notes by M. Lüscher). Also, the generator converts the
+   LCG state back to RANLUX numbers (implementation based on notes by M. Lüscher).
+   This avoids a bias in the generated numbers because the upper bits of the LCG
+   state, that is smaller than the modulus \f$ m = 2^{576} - 2^{240} + 1 \f$ (not
+   a power of 2!), have a higher probability of being 0 than 1. And finally, this
+   implementation draws 48 random bits for each generated floating point number
+   (instead of 52 bits as in the original generator) to maintain the theoretical
+   properties from understanding the original transition function of RANLUX as a
+   chaotic dynamical system.
+
+   These modifications and the portable implementation in general are described in
+   J. Hahnfeld, L. Moneta, *A Portable Implementation of RANLUX++*, vCHEP2021
+   preprint https://arxiv.org/pdf/2106.02504.pdf */
+
+static const uint64_t kA_2048[] = {
+  0xed7faa90747aaad9, 0x4cec2c78af55c101, 0xe64dcb31c48228ec,
+  0x6d8a15a13bee7cb0, 0x20b2ca60cb78c509, 0x256c3d3c662ea36c,
+  0xff74e54107684ed2, 0x492edfcc0cc8e753, 0xb48c187cf5b22097,
+};
+
+enum { kMaxPos = 9 * 64 };
+enum { kBits = 48 };
+
+typedef struct
+{
+  uint64_t state[9]; ///< RANLUX state of the generator
+  unsigned carry;    ///< Carry bit of the RANLUX state
+  int position;      ///< Current position in bits
+} ranluxpp_state;
+
+static void
+ranluxpp_set (void *vstate, unsigned long int s)
+{
+  ranluxpp_state *state = (ranluxpp_state *) vstate;
+  uint64_t lcg[9];
+  lcg[0] = 1;
+  for (int i = 1; i < 9; i++)
+    {
+      lcg[i] = 0;
+    }
+
+  uint64_t a_seed[9];
+  // Skip 2 ** 96 states.
+  powermod (kA_2048, a_seed, ((uint64_t) 1) << 48);
+  powermod (a_seed, a_seed, ((uint64_t) 1) << 48);
+  // Skip another s states.
+ powermod (a_seed, a_seed, s); + mulmod (a_seed, lcg); + + to_ranlux (lcg, state->state, &state->carry); + state->position = 0; +} + +static void +ranluxpp_advance (ranluxpp_state * state) +{ + uint64_t lcg[9]; + to_lcg (state->state, state->carry, lcg); + mulmod (kA_2048, lcg); + to_ranlux (lcg, state->state, &state->carry); + state->position = 0; +} + +static uint64_t +ranluxpp_next (ranluxpp_state * state) +{ + if (state->position + kBits > kMaxPos) + { + ranluxpp_advance (state); + } + + int idx = state->position / 64; + int offset = state->position % 64; + int numBits = 64 - offset; + + uint64_t bits = state->state[idx] >> offset; + if (numBits < kBits) + { + bits |= state->state[idx + 1] << numBits; + } + bits &= ((((uint64_t) 1) << kBits) - 1); + + state->position += kBits; + assert (state->position <= kMaxPos && "position out of range!"); + + return bits; +} + +static unsigned long int +ranluxpp_get (void *vstate) +{ + return ranluxpp_next ((ranluxpp_state *) vstate); +} + +static double +ranluxpp_get_double (void *vstate) +{ + static const double div = 1.0 / (((uint64_t) 1) << kBits); + uint64_t bits = ranluxpp_next ((ranluxpp_state *) vstate); + return bits * div; +} + +static const gsl_rng_type ranluxpp_type = { + "RANLUX++", + /*max= */ (((uint64_t) 1) << kBits) - 1, + /*min= */ 0, + /*size= */ sizeof (ranluxpp_state), + &ranluxpp_set, + &ranluxpp_get, + &ranluxpp_get_double, +}; + +// The only symbol exported from this translation unit. 
+const gsl_rng_type *gsl_rng_ranluxpp = &ranluxpp_type;
diff --git a/rng/ranluxpp/helpers.h b/rng/ranluxpp/helpers.h
new file mode 100644
index 00000000..f4903026
--- /dev/null
+++ b/rng/ranluxpp/helpers.h
@@ -0,0 +1,175 @@
+/* rng/ranluxpp/helpers.h
+ *
+ * Copyright (C) 2020, 2021 Jonas Hahnfeld
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef RANLUXPP_HELPERS_H
+#define RANLUXPP_HELPERS_H
+
+#include <stdint.h>
+
+/// Compute `a + b` and set `overflow` accordingly.
+static inline uint64_t
+add_overflow (uint64_t a, uint64_t b, unsigned *overflow)
+{
+  uint64_t add = a + b;
+  *overflow = (add < a);
+  return add;
+}
+
+/// Compute `a + b` and increment `carry` if there was an overflow
+static inline uint64_t
+add_carry (uint64_t a, uint64_t b, unsigned *carry)
+{
+  unsigned overflow;
+  uint64_t add = add_overflow (a, b, &overflow);
+  // Do NOT branch on overflow to avoid jumping code, just add 0 if there was
+  // no overflow.
+ *carry += overflow; + return add; +} + +/// Compute `a - b` and set `overflow` accordingly +static inline uint64_t +sub_overflow (uint64_t a, uint64_t b, unsigned *overflow) +{ + uint64_t sub = a - b; + *overflow = (sub > a); + return sub; +} + +/// Compute `a - b` and increment `carry` if there was an overflow +static inline uint64_t +sub_carry (uint64_t a, uint64_t b, unsigned *carry) +{ + unsigned overflow; + uint64_t sub = sub_overflow (a, b, &overflow); + // Do NOT branch on overflow to avoid jumping code, just add 0 if there was + // no overflow. + *carry += overflow; + return sub; +} + +/// Update r = r - (t1 + t2) + (t3 + t2) * b ** 10 +/// +/// This function also yields cbar = floor(r / m) as its return value (int64_t +/// because the value can be -1). With an initial value of r = t0, this can +/// be used for computing the remainder after division by m (see the function +/// mod_m in mulmod.h). The function to_ranlux passes r = 0 and uses only the +/// return value to obtain the decimal expansion after divison by m. 
+static inline int64_t +compute_r (const uint64_t * upper, uint64_t * r) +{ + // Subtract t1 (24 * 24 = 576 bits) + unsigned carry = 0; + for (int i = 0; i < 9; i++) + { + uint64_t r_i = r[i]; + r_i = sub_overflow (r_i, carry, &carry); + + uint64_t t1_i = upper[i]; + r_i = sub_carry (r_i, t1_i, &carry); + r[i] = r_i; + } + int64_t c = -((int64_t) carry); + + // Subtract t2 (only 240 bits, so need to extend) + carry = 0; + for (int i = 0; i < 9; i++) + { + uint64_t r_i = r[i]; + r_i = sub_overflow (r_i, carry, &carry); + + uint64_t t2_bits = 0; + if (i < 4) + { + t2_bits += upper[i + 5] >> 16; + if (i < 3) + { + t2_bits += upper[i + 6] << 48; + } + } + r_i = sub_carry (r_i, t2_bits, &carry); + r[i] = r_i; + } + c -= carry; + + // r += (t3 + t2) * 2 ** 240 + carry = 0; + { + uint64_t r_3 = r[3]; + // 16 upper bits + uint64_t t2_bits = (upper[5] >> 16) << 48; + uint64_t t3_bits = (upper[0] << 48); + + r_3 = add_carry (r_3, t2_bits, &carry); + r_3 = add_carry (r_3, t3_bits, &carry); + + r[3] = r_3; + } + for (int i = 0; i < 3; i++) + { + uint64_t r_i = r[i + 4]; + r_i = add_overflow (r_i, carry, &carry); + + uint64_t t2_bits = (upper[5 + i] >> 32) + (upper[6 + i] << 32); + uint64_t t3_bits = (upper[i] >> 16) + (upper[1 + i] << 48); + + r_i = add_carry (r_i, t2_bits, &carry); + r_i = add_carry (r_i, t3_bits, &carry); + + r[i + 4] = r_i; + } + { + uint64_t r_7 = r[7]; + r_7 = add_overflow (r_7, carry, &carry); + + uint64_t t2_bits = (upper[8] >> 32); + uint64_t t3_bits = (upper[3] >> 16) + (upper[4] << 48); + + r_7 = add_carry (r_7, t2_bits, &carry); + r_7 = add_carry (r_7, t3_bits, &carry); + + r[7] = r_7; + } + { + uint64_t r_8 = r[8]; + r_8 = add_overflow (r_8, carry, &carry); + + uint64_t t3_bits = (upper[4] >> 16) + (upper[5] << 48); + + r_8 = add_carry (r_8, t3_bits, &carry); + + r[8] = r_8; + } + c += carry; + + // c = floor(r / 2 ** 576) has been computed along the way via the carry + // flags. 
Now if c = 0 and the value currently stored in r is greater or
+  // equal to m, we need cbar = 1 and subtract m, otherwise cbar = c. The
+  // value currently in r is greater or equal to m, if and only if one of
+  // the last 240 bits is set and the upper bits are all set.
+  int greater_m = (r[0] | r[1] | r[2] | (r[3] & 0x0000ffffffffffff)) != 0;
+  greater_m &= (r[3] >> 48) == 0xffff;
+  for (int i = 4; i < 9; i++)
+    {
+      greater_m &= (r[i] == UINT64_MAX);
+    }
+  return c + (c == 0 && greater_m);
+}
+
+#endif
diff --git a/rng/ranluxpp/mulmod.h b/rng/ranluxpp/mulmod.h
new file mode 100644
index 00000000..f39d1799
--- /dev/null
+++ b/rng/ranluxpp/mulmod.h
@@ -0,0 +1,273 @@
+/* rng/ranluxpp/mulmod.h
+ *
+ * Copyright (C) 2020, 2021 Jonas Hahnfeld
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef RANLUXPP_MULMOD_H
+#define RANLUXPP_MULMOD_H
+
+#include "helpers.h"
+
+#include <stdint.h>
+
+/// Multiply two 576 bit numbers, stored as 9 numbers of 64 bits each
+///
+/// \param[in] in1 first factor as 9 numbers of 64 bits each
+/// \param[in] in2 second factor as 9 numbers of 64 bits each
+/// \param[out] out result with 18 numbers of 64 bits each
+static void
+multiply9x9 (const uint64_t * in1, const uint64_t * in2, uint64_t * out)
+{
+  uint64_t next = 0;
+  unsigned nextCarry = 0;
+
+#if defined(__clang__) || defined(__INTEL_COMPILER)
+#pragma unroll
+#elif defined(__GNUC__) && __GNUC__ >= 8
+// This pragma was introduced in GCC version 8.
+#pragma GCC unroll 18
+#endif
+  for (int i = 0; i < 18; i++)
+    {
+      uint64_t current = next;
+      unsigned carry = nextCarry;
+
+      next = 0;
+      nextCarry = 0;
+
+#if defined(__clang__) || defined(__INTEL_COMPILER)
+#pragma unroll
+#elif defined(__GNUC__) && __GNUC__ >= 8
+// This pragma was introduced in GCC version 8.
+#pragma GCC unroll 9
+#endif
+      for (int j = 0; j < 9; j++)
+        {
+          int k = i - j;
+          if (k < 0 || k >= 9)
+            continue;
+
+          uint64_t fac1 = in1[j];
+          uint64_t fac2 = in2[k];
+#if defined(__SIZEOF_INT128__) && !defined(RANLUXPP_NO_INT128)
+          unsigned __int128 prod = fac1;
+          prod = prod * fac2;
+
+          uint64_t upper = prod >> 64;
+          uint64_t lower = (uint64_t) prod;
+#else
+          uint64_t upper1 = fac1 >> 32;
+          uint64_t lower1 = (uint32_t) fac1;
+
+          uint64_t upper2 = fac2 >> 32;
+          uint64_t lower2 = (uint32_t) fac2;
+
+          // Multiply 32-bit parts, each product has a maximum value of
+          // (2 ** 32 - 1) ** 2 = 2 ** 64 - 2 * 2 ** 32 + 1.
+          uint64_t upper = upper1 * upper2;
+          uint64_t middle1 = upper1 * lower2;
+          uint64_t middle2 = lower1 * upper2;
+          uint64_t lower = lower1 * lower2;
+
+          // When adding the two products, the maximum value for middle is
+          // 2 * 2 ** 64 - 4 * 2 ** 32 + 2, which exceeds a uint64_t.
+          unsigned overflow;
+          uint64_t middle = add_overflow (middle1, middle2, &overflow);
+          // Handling the overflow by a multiplication with 0 or 1 is cheaper
+          // than branching with an if statement, which the compiler does not
+          // optimize to this equivalent code. Note that we could do entirely
+          // without this overflow handling when summing up the intermediate
+          // products differently as described in the following SO answer:
+          // https://stackoverflow.com/a/51587262
+          // However, this approach takes at least the same amount of thinking
+          // why a) the code gives the same results without b) overflowing due
+          // to the mixture of 32 bit arithmetic. Moreover, my tests show that
+          // the scheme implemented here is actually slightly more performant.
+          uint64_t overflow_add = overflow * (((uint64_t) 1) << 32);
+          // This addition can never overflow because the maximum value of upper
+          // is 2 ** 64 - 2 * 2 ** 32 + 1 (see above). When now adding another
+          // 2 ** 32, the result is 2 ** 64 - 2 ** 32 + 1 and still smaller than
+          // the maximum 2 ** 64 - 1 that can be stored in a uint64_t.
+          upper += overflow_add;
+
+          uint64_t middle_upper = middle >> 32;
+          uint64_t middle_lower = middle << 32;
+
+          lower = add_overflow (lower, middle_lower, &overflow);
+          upper += overflow;
+
+          // This still can't overflow since the maximum of middle_upper is
+          // - 2 ** 32 - 4 if there was an overflow for middle above, bringing
+          //   the maximum value of upper to 2 ** 64 - 2.
+          // - otherwise upper still has the initial maximum value given above
+          //   and the addition of a value smaller than 2 ** 32 brings it to
+          //   a maximum value of 2 ** 64 - 2 ** 32 + 2.
+          // (Both cases include the increment to handle the overflow in lower.)
+          //
+          // All the reasoning makes perfect sense given that the product of two
+          // 64 bit numbers is smaller than or equal to
+          // (2 ** 64 - 1) ** 2 = 2 ** 128 - 2 * 2 ** 64 + 1
+          // with the upper bits matching the 2 ** 64 - 2 of the first case.
+ upper += middle_upper; +#endif + + // Add to current, remember carry. + current = add_carry (current, lower, &carry); + + // Add to next, remember nextCarry. + next = add_carry (next, upper, &nextCarry); + } + + next = add_carry (next, carry, &nextCarry); + + out[i] = current; + } +} + +/// Compute a value congruent to mul modulo m less than 2 ** 576 +/// +/// \param[in] mul product from multiply9x9 with 18 numbers of 64 bits each +/// \param[out] out result with 9 numbers of 64 bits each +/// +/// \f$ m = 2^{576} - 2^{240} + 1 \f$ +/// +/// The result in out is guaranteed to be smaller than the modulus. +static void +mod_m (const uint64_t * mul, uint64_t * out) +{ + uint64_t r[9]; + // Assign r = t0 + for (int i = 0; i < 9; i++) + { + r[i] = mul[i]; + } + + int64_t c = compute_r (mul + 9, r); + + // To update r = r - c * m, it suffices to know c * (-2 ** 240 + 1) + // because the 2 ** 576 will cancel out. Also note that c may be zero, but + // the operation is still performed to avoid branching. + + // c * (-2 ** 240 + 1) in 576 bits looks as follows, depending on c: + // - if c = 0, the number is zero. + // - if c = 1: bits 576 to 240 are set, + // bits 239 to 1 are zero, and + // the last one is set + // - if c = -1, which corresponds to all bits set (signed int64_t): + // bits 576 to 240 are zero and the rest is set. + // Note that all bits except the last are exactly complimentary (unless c = 0) + // and the last byte is conveniently represented by c already. + // Now construct the three bit patterns from c, their names correspond to the + // assembly implementation by Alexei Sibidanov. + + // c = 0 -> t0 = 0; c = 1 -> t0 = 0; c = -1 -> all bits set (sign extension) + // (The assembly implementation shifts by 63, which gives the same result.) + int64_t t0 = c >> 1; + + // Left shifting negative values is undefined behavior until C++20, cast to + // unsigned. 
+ uint64_t c_unsigned = (uint64_t) c; + + // c = 0 -> t2 = 0; c = 1 -> upper 16 bits set; c = -1 -> lower 48 bits set + int64_t t2 = t0 - (c_unsigned << 48); + + // c = 0 -> t1 = 0; c = 1 -> all bits set (sign extension); c = -1 -> t1 = 0 + // (The assembly implementation shifts by 63, which gives the same result.) + int64_t t1 = t2 >> 48; + + unsigned carry = 0; + { + uint64_t r_0 = r[0]; + + uint64_t out_0 = sub_carry (r_0, c, &carry); + out[0] = out_0; + } + for (int i = 1; i < 3; i++) + { + uint64_t r_i = r[i]; + r_i = sub_overflow (r_i, carry, &carry); + + uint64_t out_i = sub_carry (r_i, t0, &carry); + out[i] = out_i; + } + { + uint64_t r_3 = r[3]; + r_3 = sub_overflow (r_3, carry, &carry); + + uint64_t out_3 = sub_carry (r_3, t2, &carry); + out[3] = out_3; + } + for (int i = 4; i < 9; i++) + { + uint64_t r_i = r[i]; + r_i = sub_overflow (r_i, carry, &carry); + + uint64_t out_i = sub_carry (r_i, t1, &carry); + out[i] = out_i; + } +} + +/// Combine multiply9x9 and mod_m with internal temporary storage +/// +/// \param[in] in1 first factor with 9 numbers of 64 bits each +/// \param[inout] inout second factor and also the output of the same size +/// +/// The result in inout is guaranteed to be smaller than the modulus. +static void +mulmod (const uint64_t * in1, uint64_t * inout) +{ + uint64_t mul[2 * 9] = { 0 }; + multiply9x9 (in1, inout, mul); + mod_m (mul, inout); +} + +/// Compute base to the n modulo m +/// +/// \param[in] base with 9 numbers of 64 bits each +/// \param[out] res output with 9 numbers of 64 bits each +/// \param[in] n exponent +/// +/// The arguments base and res may point to the same location. 
+static void
+powermod (const uint64_t * base, uint64_t * res, uint64_t n)
+{
+  uint64_t fac[9] = { 0 };
+  fac[0] = base[0];
+  res[0] = 1;
+  for (int i = 1; i < 9; i++)
+    {
+      fac[i] = base[i];
+      res[i] = 0;
+    }
+
+  uint64_t mul[18] = { 0 };
+  while (n)
+    {
+      if (n & 1)
+        {
+          multiply9x9 (res, fac, mul);
+          mod_m (mul, res);
+        }
+      n >>= 1;
+      if (!n)
+        break;
+      multiply9x9 (fac, fac, mul);
+      mod_m (mul, fac);
+    }
+}
+
+#endif
diff --git a/rng/ranluxpp/ranlux_lcg.h b/rng/ranluxpp/ranlux_lcg.h
new file mode 100644
index 00000000..4d5c5715
--- /dev/null
+++ b/rng/ranluxpp/ranlux_lcg.h
@@ -0,0 +1,110 @@
+/* rng/ranluxpp/ranlux_lcg.h
+ *
+ * Copyright (C) 2020, 2021 Jonas Hahnfeld
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef RANLUXPP_RANLUX_LCG_H
+#define RANLUXPP_RANLUX_LCG_H
+
+#include "helpers.h"
+
+#include <stdint.h>
+
+/// Convert RANLUX numbers to an LCG state
+///
+/// \param[in] ranlux the RANLUX numbers as 576 bits
+/// \param[out] lcg the 576 bits of the LCG state, smaller than m
+/// \param[in] c the carry bit of the RANLUX state
+///
+/// \f$ m = 2^{576} - 2^{240} + 1 \f$
+static void
+to_lcg (const uint64_t * ranlux, unsigned c, uint64_t * lcg)
+{
+  unsigned carry = 0;
+  // Subtract the final 240 bits.
+ for (int i = 0; i < 9; i++) + { + uint64_t ranlux_i = ranlux[i]; + uint64_t lcg_i = sub_overflow (ranlux_i, carry, &carry); + + uint64_t bits = 0; + if (i < 4) + { + bits += ranlux[i + 5] >> 16; + if (i < 3) + { + bits += ranlux[i + 6] << 48; + } + } + lcg_i = sub_carry (lcg_i, bits, &carry); + lcg[i] = lcg_i; + } + + // Add and propagate the carry bit. + for (int i = 0; i < 9; i++) + { + lcg[i] = add_overflow (lcg[i], c, &c); + } +} + +/// Convert an LCG state to RANLUX numbers +/// +/// \param[in] lcg the 576 bits of the LCG state, must be smaller than m +/// \param[out] ranlux the RANLUX numbers as 576 bits +/// \param[out] c the carry bit of the RANLUX state +/// +/// \f$ m = 2^{576} - 2^{240} + 1 \f$ +static void +to_ranlux (const uint64_t * lcg, uint64_t * ranlux, unsigned *c_out) +{ + uint64_t r[9] = { 0 }; + int64_t c = compute_r (lcg, r); + + // ranlux = t1 + t2 + c + unsigned carry = 0; + for (int i = 0; i < 9; i++) + { + uint64_t in_i = lcg[i]; + uint64_t tmp_i = add_overflow (in_i, carry, &carry); + + uint64_t bits = 0; + if (i < 4) + { + bits += lcg[i + 5] >> 16; + if (i < 3) + { + bits += lcg[i + 6] << 48; + } + } + tmp_i = add_carry (tmp_i, bits, &carry); + ranlux[i] = tmp_i; + } + + // If c = -1, we need to add it to all components. 
+  int64_t c1 = c >> 1;
+  ranlux[0] = add_overflow (ranlux[0], c, &carry);
+  for (int i = 1; i < 9; i++)
+    {
+      uint64_t ranlux_i = add_overflow (ranlux[i], carry, &carry);
+      ranlux_i = add_carry (ranlux_i, c1, &carry);
+      ranlux[i] = ranlux_i;
+    }
+
+  *c_out = carry;
+}
+
+#endif
diff --git a/rng/test.c b/rng/test.c
index f82ad1ae..96e374cc 100644
--- a/rng/test.c
+++ b/rng/test.c
@@ -128,6 +128,12 @@ main (void)
   rng_test (gsl_rng_ranlxd2, 1, 10000, 3949287736UL);
   /* 0.919515205581550532 * ldexp(1.0,32) */
 
+  /* The number comes from running
+       RanluxppEngine rng(314159265);
+       rng.Skip(9999);
+       std::cout << rng.IntRndm() << std::endl; */
+  rng_test (gsl_rng_ranluxpp, 314159265, 10000, 176636690327553UL);
+
   /* FIXME: the tests below were made by running the original code in
      the ../random directory and getting the expected value from
      that.  An analytic calculation would be preferable. */
diff --git a/rng/types.c b/rng/types.c
index 071edcf6..7ce8eb3b 100644
--- a/rng/types.c
+++ b/rng/types.c
@@ -82,6 +82,7 @@ gsl_rng_types_setup (void)
   ADD(gsl_rng_ranlxs0);
   ADD(gsl_rng_ranlxs1);
   ADD(gsl_rng_ranlxs2);
+  ADD(gsl_rng_ranluxpp);
   ADD(gsl_rng_ranmar);
   ADD(gsl_rng_slatec);
   ADD(gsl_rng_taus);
-- 
2.29.2