aboutsummaryrefslogtreecommitdiff
path: root/libphobos/src/std/internal/math
diff options
context:
space:
mode:
authorIain Buclaw <ibuclaw@gcc.gnu.org>2018-10-28 19:51:47 +0000
committerIain Buclaw <ibuclaw@gcc.gnu.org>2018-10-28 19:51:47 +0000
commitb4c522fabd0df7be08882d2207df8b2765026110 (patch)
treeb5ffc312b0a441c1ba24323152aec463fdbe5e9f /libphobos/src/std/internal/math
parent01ce9e31a02c8039d88e90f983735104417bf034 (diff)
downloadgcc-b4c522fabd0df7be08882d2207df8b2765026110.zip
gcc-b4c522fabd0df7be08882d2207df8b2765026110.tar.gz
gcc-b4c522fabd0df7be08882d2207df8b2765026110.tar.bz2
Add D front-end, libphobos library, and D2 testsuite.
ChangeLog: * Makefile.def (target_modules): Add libphobos. (flags_to_pass): Add GDC, GDCFLAGS, GDC_FOR_TARGET and GDCFLAGS_FOR_TARGET. (dependencies): Make libphobos depend on libatomic, libbacktrace configure, and zlib configure. (language): Add language d. * Makefile.in: Rebuild. * Makefile.tpl (BUILD_EXPORTS): Add GDC and GDCFLAGS. (HOST_EXPORTS): Add GDC. (POSTSTAGE1_HOST_EXPORTS): Add GDC and GDC_FOR_BUILD. (BASE_TARGET_EXPORTS): Add GDC. (GDC_FOR_BUILD, GDC, GDCFLAGS): New variables. (GDC_FOR_TARGET, GDC_FLAGS_FOR_TARGET): New variables. (EXTRA_HOST_FLAGS): Add GDC. (STAGE1_FLAGS_TO_PASS): Add GDC. (EXTRA_TARGET_FLAGS): Add GDC and GDCFLAGS. * config-ml.in: Treat GDC and GDCFLAGS like other compiler/flag environment variables. * configure: Rebuild. * configure.ac: Add target-libphobos to target_libraries. Set and substitute GDC_FOR_BUILD and GDC_FOR_TARGET. config/ChangeLog: * multi.m4: Set GDC. gcc/ChangeLog: * Makefile.in (tm_d_file_list, tm_d_include_list): New variables. (TM_D_H, D_TARGET_DEF, D_TARGET_H, D_TARGET_OBJS): New variables. (tm_d.h, cs-tm_d.h, default-d.o): New rules. (d/d-target-hooks-def.h, s-d-target-hooks-def-h): New rules. (s-tm-texi): Also check timestamp on d-target.def. (generated_files): Add TM_D_H and d-target-hooks-def.h. (build/genhooks.o): Also depend on D_TARGET_DEF. * config.gcc (tm_d_file, d_target_objs, target_has_targetdm): New variables. * config/aarch64/aarch64-d.c: New file. * config/aarch64/aarch64-linux.h (GNU_USER_TARGET_D_CRITSEC_SIZE): Define. * config/aarch64/aarch64-protos.h (aarch64_d_target_versions): New prototype. * config/aarch64/aarch64.h (TARGET_D_CPU_VERSIONS): Define. * config/aarch64/t-aarch64 (aarch64-d.o): New rule. * config/arm/arm-d.c: New file. * config/arm/arm-protos.h (arm_d_target_versions): New prototype. * config/arm/arm.h (TARGET_D_CPU_VERSIONS): Define. * config/arm/linux-eabi.h (EXTRA_TARGET_D_OS_VERSIONS): Define. * config/arm/t-arm (arm-d.o): New rule. * config/default-d.c: New file. * config/glibc-d.c: New file. * config/gnu.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/i386/i386-d.c: New file. * config/i386/i386-protos.h (ix86_d_target_versions): New prototype. * config/i386/i386.h (TARGET_D_CPU_VERSIONS): Define. * config/i386/linux-common.h (EXTRA_TARGET_D_OS_VERSIONS): Define. (GNU_USER_TARGET_D_CRITSEC_SIZE): Define. * config/i386/t-i386 (i386-d.o): New rule. * config/kfreebsd-gnu.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/kopensolaris-gnu.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/linux-android.h (ANDROID_TARGET_D_OS_VERSIONS): Define. * config/linux.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/mips/linux-common.h (EXTRA_TARGET_D_OS_VERSIONS): Define. * config/mips/mips-d.c: New file. * config/mips/mips-protos.h (mips_d_target_versions): New prototype. * config/mips/mips.h (TARGET_D_CPU_VERSIONS): Define. * config/mips/t-mips (mips-d.o): New rule. * config/powerpcspe/linux.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/powerpcspe/linux64.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/powerpcspe/powerpcspe-d.c: New file. * config/powerpcspe/powerpcspe-protos.h (rs6000_d_target_versions): New prototype. * config/powerpcspe/powerpcspe.c (rs6000_output_function_epilogue): Support GNU D by using 0 as the language type. * config/powerpcspe/powerpcspe.h (TARGET_D_CPU_VERSIONS): Define. * config/powerpcspe/t-powerpcspe (powerpcspe-d.o): New rule. * config/riscv/riscv-d.c: New file. * config/riscv/riscv-protos.h (riscv_d_target_versions): New prototype. * config/riscv/riscv.h (TARGET_D_CPU_VERSIONS): Define. * config/riscv/t-riscv (riscv-d.o): New rule. * config/rs6000/linux.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/rs6000/linux64.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/rs6000/rs6000-d.c: New file. * config/rs6000/rs6000-protos.h (rs6000_d_target_versions): New prototype. * config/rs6000/rs6000.c (rs6000_output_function_epilogue): Support GNU D by using 0 as the language type. * config/rs6000/rs6000.h (TARGET_D_CPU_VERSIONS): Define. * config/rs6000/t-rs6000 (rs6000-d.o): New rule. * config/s390/s390-d.c: New file. * config/s390/s390-protos.h (s390_d_target_versions): New prototype. * config/s390/s390.h (TARGET_D_CPU_VERSIONS): Define. * config/s390/t-s390 (s390-d.o): New rule. * config/sparc/sparc-d.c: New file. * config/sparc/sparc-protos.h (sparc_d_target_versions): New prototype. * config/sparc/sparc.h (TARGET_D_CPU_VERSIONS): Define. * config/sparc/t-sparc (sparc-d.o): New rule. * config/t-glibc (glibc-d.o): New rule. * configure: Regenerated. * configure.ac (tm_d_file): New variable. (tm_d_file_list, tm_d_include_list, d_target_objs): Add substitutes. * doc/contrib.texi (Contributors): Add self for the D frontend. * doc/frontends.texi (G++ and GCC): Mention D as a supported language. * doc/install.texi (Configuration): Mention libphobos as an option for --enable-shared. Mention d as an option for --enable-languages. (Testing): Mention check-d as a target. * doc/invoke.texi (Overall Options): Mention .d, .dd, and .di as file name suffixes. Mention d as a -x option. * doc/sourcebuild.texi (Top Level): Mention libphobos. * doc/standards.texi (Standards): Add section on D language. * doc/tm.texi: Regenerated. * doc/tm.texi.in: Add @node for D language and ABI, and @hook for TARGET_CPU_VERSIONS, TARGET_D_OS_VERSIONS, and TARGET_D_CRITSEC_SIZE. * dwarf2out.c (is_dlang): New function. (gen_compile_unit_die): Use DW_LANG_D for D. (declare_in_namespace): Return module die for D, instead of adding extra declarations into the namespace. (gen_namespace_die): Generate DW_TAG_module for D. (gen_decl_die): Handle CONST_DECLSs for D. (dwarf2out_decl): Likewise. (prune_unused_types_walk_local_classes): Handle DW_tag_interface_type. (prune_unused_types_walk): Handle DW_tag_interface_type same as other kinds of aggregates. * gcc.c (default_compilers): Add entries for .d, .dd and .di. * genhooks.c: Include d/d-target.def. gcc/po/ChangeLog: * EXCLUDES: Add sources from d/dmd. gcc/testsuite/ChangeLog: * gcc.misc-tests/help.exp: Add D to option descriptions check. * gdc.dg/asan/asan.exp: New file. * gdc.dg/asan/gdc272.d: New test. * gdc.dg/compilable.d: New test. * gdc.dg/dg.exp: New file. * gdc.dg/gdc254.d: New test. * gdc.dg/gdc260.d: New test. * gdc.dg/gdc270a.d: New test. * gdc.dg/gdc270b.d: New test. * gdc.dg/gdc282.d: New test. * gdc.dg/gdc283.d: New test. * gdc.dg/imports/gdc170.d: New test. * gdc.dg/imports/gdc231.d: New test. * gdc.dg/imports/gdc239.d: New test. * gdc.dg/imports/gdc241a.d: New test. * gdc.dg/imports/gdc241b.d: New test. * gdc.dg/imports/gdc251a.d: New test. * gdc.dg/imports/gdc251b.d: New test. * gdc.dg/imports/gdc253.d: New test. * gdc.dg/imports/gdc254a.d: New test. * gdc.dg/imports/gdc256.d: New test. * gdc.dg/imports/gdc27.d: New test. * gdc.dg/imports/gdcpkg256/package.d: New test. * gdc.dg/imports/runnable.d: New test. * gdc.dg/link.d: New test. * gdc.dg/lto/lto.exp: New file. * gdc.dg/lto/ltotests_0.d: New test. * gdc.dg/lto/ltotests_1.d: New test. * gdc.dg/runnable.d: New test. * gdc.dg/simd.d: New test. * gdc.test/gdc-test.exp: New file. * lib/gdc-dg.exp: New file. * lib/gdc.exp: New file. libphobos/ChangeLog: * Makefile.am: New file. * Makefile.in: New file. * acinclude.m4: New file. * aclocal.m4: New file. * config.h.in: New file. * configure: New file. * configure.ac: New file. * d_rules.am: New file. * libdruntime/Makefile.am: New file. * libdruntime/Makefile.in: New file. * libdruntime/__entrypoint.di: New file. * libdruntime/__main.di: New file. * libdruntime/gcc/attribute.d: New file. * libdruntime/gcc/backtrace.d: New file. * libdruntime/gcc/builtins.d: New file. * libdruntime/gcc/config.d.in: New file. * libdruntime/gcc/deh.d: New file. * libdruntime/gcc/libbacktrace.d.in: New file. * libdruntime/gcc/unwind/arm.d: New file. * libdruntime/gcc/unwind/arm_common.d: New file. * libdruntime/gcc/unwind/c6x.d: New file. * libdruntime/gcc/unwind/generic.d: New file. * libdruntime/gcc/unwind/package.d: New file. * libdruntime/gcc/unwind/pe.d: New file. * m4/autoconf.m4: New file. * m4/druntime.m4: New file. * m4/druntime/cpu.m4: New file. * m4/druntime/libraries.m4: New file. * m4/druntime/os.m4: New file. * m4/gcc_support.m4: New file. * m4/gdc.m4: New file. * m4/libtool.m4: New file. * src/Makefile.am: New file. * src/Makefile.in: New file. * src/libgphobos.spec.in: New file. * testsuite/Makefile.am: New file. * testsuite/Makefile.in: New file. * testsuite/config/default.exp: New file. * testsuite/lib/libphobos-dg.exp: New file. * testsuite/lib/libphobos.exp: New file. * testsuite/testsuite_flags.in: New file. From-SVN: r265573
Diffstat (limited to 'libphobos/src/std/internal/math')
-rw-r--r--libphobos/src/std/internal/math/biguintcore.d2571
-rw-r--r--libphobos/src/std/internal/math/biguintnoasm.d370
-rw-r--r--libphobos/src/std/internal/math/biguintx86.d1353
-rw-r--r--libphobos/src/std/internal/math/errorfunction.d1145
-rw-r--r--libphobos/src/std/internal/math/gammafunction.d1834
5 files changed, 7273 insertions, 0 deletions
diff --git a/libphobos/src/std/internal/math/biguintcore.d b/libphobos/src/std/internal/math/biguintcore.d
new file mode 100644
index 0000000..f5cd769
--- /dev/null
+++ b/libphobos/src/std/internal/math/biguintcore.d
@@ -0,0 +1,2571 @@
+/** Fundamental operations for arbitrary-precision arithmetic
+ *
+ * These functions are for internal use only.
+ */
+/* Copyright Don Clugston 2008 - 2010.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+/* References:
+ "Modern Computer Arithmetic" (MCA) is the primary reference for all
+ algorithms used in this library.
+ - R.P. Brent and P. Zimmermann, "Modern Computer Arithmetic",
+ Version 0.5.9, (Oct 2010).
+ - C. Burkinel and J. Ziegler, "Fast Recursive Division", MPI-I-98-1-022,
+ Max-Planck Institute fuer Informatik, (Oct 1998).
+ - G. Hanrot, M. Quercia, and P. Zimmermann, "The Middle Product Algorithm, I.",
+ INRIA 4664, (Dec 2002).
+ - M. Bodrato and A. Zanoni, "What about Toom-Cook Matrices Optimality?",
+ http://bodrato.it/papers (2006).
+ - A. Fog, "Optimizing subroutines in assembly language",
+ www.agner.org/optimize (2008).
+ - A. Fog, "The microarchitecture of Intel and AMD CPU's",
+ www.agner.org/optimize (2008).
+ - A. Fog, "Instruction tables: Lists of instruction latencies, throughputs
+ and micro-operation breakdowns for Intel and AMD CPU's.", www.agner.org/optimize (2008).
+
+Idioms:
+ Many functions in this module use
+ 'func(Tulong)(Tulong x) if (is(Tulong == ulong))' rather than 'func(ulong x)'
+ in order to disable implicit conversion.
+
+*/
+module std.internal.math.biguintcore;
+
+version (D_InlineAsm_X86)
+{
+ import std.internal.math.biguintx86;
+}
+else
+{
+ import std.internal.math.biguintnoasm;
+}
+
+alias multibyteAdd = multibyteAddSub!('+');
+alias multibyteSub = multibyteAddSub!('-');
+
+
+import core.cpuid;
+public import std.ascii : LetterCase;
+import std.range.primitives;
+import std.traits;
+
+shared static this()
+{
+ CACHELIMIT = core.cpuid.datacache[0].size*1024/2;
+}
+
+private:
+// Limits for when to switch between algorithms.
+immutable size_t CACHELIMIT; // Half the size of the data cache.
+enum size_t FASTDIVLIMIT = 100; // crossover to recursive division
+
+
+// These constants are used by shift operations
+static if (BigDigit.sizeof == int.sizeof)
+{
+ enum { LG2BIGDIGITBITS = 5, BIGDIGITSHIFTMASK = 31 };
+ alias BIGHALFDIGIT = ushort;
+}
+else static if (BigDigit.sizeof == long.sizeof)
+{
+ alias BIGHALFDIGIT = uint;
+ enum { LG2BIGDIGITBITS = 6, BIGDIGITSHIFTMASK = 63 };
+}
+else static assert(0, "Unsupported BigDigit size");
+
+import std.exception : assumeUnique;
+import std.traits : isIntegral;
+enum BigDigitBits = BigDigit.sizeof*8;
+template maxBigDigits(T)
+if (isIntegral!T)
+{
+ enum maxBigDigits = (T.sizeof+BigDigit.sizeof-1)/BigDigit.sizeof;
+}
+
+static immutable BigDigit[] ZERO = [0];
+static immutable BigDigit[] ONE = [1];
+static immutable BigDigit[] TWO = [2];
+static immutable BigDigit[] TEN = [10];
+
+
+public:
+
+/// BigUint performs memory management and wraps the low-level calls.
+struct BigUint
+{
+private:
+ pure invariant()
+ {
+ assert( data.length >= 1 && (data.length == 1 || data[$-1] != 0 ));
+ }
+
+ immutable(BigDigit) [] data = ZERO;
+
+ this(immutable(BigDigit) [] x) pure nothrow @nogc @safe
+ {
+ data = x;
+ }
+ package(std) // used from: std.bigint
+ this(T)(T x) pure nothrow @safe if (isIntegral!T)
+ {
+ opAssign(x);
+ }
+
+ enum trustedAssumeUnique = function(BigDigit[] input) pure @trusted @nogc {
+ return assumeUnique(input);
+ };
+public:
+ // Length in uints
+ @property size_t uintLength() pure nothrow const @safe @nogc
+ {
+ static if (BigDigit.sizeof == uint.sizeof)
+ {
+ return data.length;
+ }
+ else static if (BigDigit.sizeof == ulong.sizeof)
+ {
+ return data.length * 2 -
+ ((data[$-1] & 0xFFFF_FFFF_0000_0000L) ? 1 : 0);
+ }
+ }
+ @property size_t ulongLength() pure nothrow const @safe @nogc
+ {
+ static if (BigDigit.sizeof == uint.sizeof)
+ {
+ return (data.length + 1) >> 1;
+ }
+ else static if (BigDigit.sizeof == ulong.sizeof)
+ {
+ return data.length;
+ }
+ }
+
+ // The value at (cast(ulong[]) data)[n]
+ ulong peekUlong(int n) pure nothrow const @safe @nogc
+ {
+ static if (BigDigit.sizeof == int.sizeof)
+ {
+ if (data.length == n*2 + 1) return data[n*2];
+ return data[n*2] + ((cast(ulong) data[n*2 + 1]) << 32 );
+ }
+ else static if (BigDigit.sizeof == long.sizeof)
+ {
+ return data[n];
+ }
+ }
+ uint peekUint(int n) pure nothrow const @safe @nogc
+ {
+ static if (BigDigit.sizeof == int.sizeof)
+ {
+ return data[n];
+ }
+ else
+ {
+ immutable x = data[n >> 1];
+ return (n & 1) ? cast(uint)(x >> 32) : cast(uint) x;
+ }
+ }
+public:
+ ///
+ void opAssign(Tulong)(Tulong u) pure nothrow @safe if (is (Tulong == ulong))
+ {
+ if (u == 0) data = ZERO;
+ else if (u == 1) data = ONE;
+ else if (u == 2) data = TWO;
+ else if (u == 10) data = TEN;
+ else
+ {
+ static if (BigDigit.sizeof == int.sizeof)
+ {
+ uint ulo = cast(uint)(u & 0xFFFF_FFFF);
+ uint uhi = cast(uint)(u >> 32);
+ if (uhi == 0)
+ {
+ data = [ulo];
+ }
+ else
+ {
+ data = [ulo, uhi];
+ }
+ }
+ else static if (BigDigit.sizeof == long.sizeof)
+ {
+ data = [u];
+ }
+ }
+ }
+ void opAssign(Tdummy = void)(BigUint y) pure nothrow @nogc @safe
+ {
+ this.data = y.data;
+ }
+
+ ///
+ int opCmp(Tdummy = void)(const BigUint y) pure nothrow @nogc const @safe
+ {
+ if (data.length != y.data.length)
+ return (data.length > y.data.length) ? 1 : -1;
+ size_t k = highestDifferentDigit(data, y.data);
+ if (data[k] == y.data[k])
+ return 0;
+ return data[k] > y.data[k] ? 1 : -1;
+ }
+
+ ///
+ int opCmp(Tulong)(Tulong y) pure nothrow @nogc const @safe if (is (Tulong == ulong))
+ {
+ if (data.length > maxBigDigits!Tulong)
+ return 1;
+
+ foreach_reverse (i; 0 .. maxBigDigits!Tulong)
+ {
+ BigDigit tmp = cast(BigDigit)(y>>(i*BigDigitBits));
+ if (tmp == 0)
+ if (data.length >= i+1)
+ {
+ // Since ZERO is [0], so we cannot simply return 1 here, as
+ // data[i] would be 0 for i == 0 in that case.
+ return (data[i] > 0) ? 1 : 0;
+ }
+ else
+ continue;
+ else
+ if (i+1 > data.length)
+ return -1;
+ else if (tmp != data[i])
+ return data[i] > tmp ? 1 : -1;
+ }
+ return 0;
+ }
+
+ bool opEquals(Tdummy = void)(ref const BigUint y) pure nothrow @nogc const @safe
+ {
+ return y.data[] == data[];
+ }
+
+ bool opEquals(Tdummy = void)(ulong y) pure nothrow @nogc const @safe
+ {
+ if (data.length > 2)
+ return false;
+ uint ylo = cast(uint)(y & 0xFFFF_FFFF);
+ uint yhi = cast(uint)(y >> 32);
+ if (data.length == 2 && data[1]!=yhi)
+ return false;
+ if (data.length == 1 && yhi != 0)
+ return false;
+ return (data[0] == ylo);
+ }
+
+ bool isZero() pure const nothrow @safe @nogc
+ {
+ return data.length == 1 && data[0] == 0;
+ }
+
+ size_t numBytes() pure nothrow const @safe @nogc
+ {
+ return data.length * BigDigit.sizeof;
+ }
+
+ // the extra bytes are added to the start of the string
+ char [] toDecimalString(int frontExtraBytes) const pure nothrow
+ {
+ immutable predictlength = 20+20*(data.length/2); // just over 19
+ char [] buff = new char[frontExtraBytes + predictlength];
+ ptrdiff_t sofar = biguintToDecimal(buff, data.dup);
+ return buff[sofar-frontExtraBytes..$];
+ }
+
+ /** Convert to a hex string, printing a minimum number of digits 'minPadding',
+ * allocating an additional 'frontExtraBytes' at the start of the string.
+ * Padding is done with padChar, which may be '0' or ' '.
+ * 'separator' is a digit separation character. If non-zero, it is inserted
+ * between every 8 digits.
+ * Separator characters do not contribute to the minPadding.
+ */
+ char [] toHexString(int frontExtraBytes, char separator = 0,
+ int minPadding=0, char padChar = '0',
+ LetterCase letterCase = LetterCase.upper) const pure nothrow @safe
+ {
+ // Calculate number of extra padding bytes
+ size_t extraPad = (minPadding > data.length * 2 * BigDigit.sizeof)
+ ? minPadding - data.length * 2 * BigDigit.sizeof : 0;
+
+ // Length not including separator bytes
+ size_t lenBytes = data.length * 2 * BigDigit.sizeof;
+
+ // Calculate number of separator bytes
+ size_t mainSeparatorBytes = separator ? (lenBytes / 8) - 1 : 0;
+ immutable totalSeparatorBytes = separator ? ((extraPad + lenBytes + 7) / 8) - 1: 0;
+
+ char [] buff = new char[lenBytes + extraPad + totalSeparatorBytes + frontExtraBytes];
+ biguintToHex(buff[$ - lenBytes - mainSeparatorBytes .. $], data, separator, letterCase);
+ if (extraPad > 0)
+ {
+ if (separator)
+ {
+ size_t start = frontExtraBytes; // first index to pad
+ if (extraPad &7)
+ {
+ // Do 1 to 7 extra zeros.
+ buff[frontExtraBytes .. frontExtraBytes + (extraPad & 7)] = padChar;
+ buff[frontExtraBytes + (extraPad & 7)] = (padChar == ' ' ? ' ' : separator);
+ start += (extraPad & 7) + 1;
+ }
+ for (int i=0; i< (extraPad >> 3); ++i)
+ {
+ buff[start .. start + 8] = padChar;
+ buff[start + 8] = (padChar == ' ' ? ' ' : separator);
+ start += 9;
+ }
+ }
+ else
+ {
+ buff[frontExtraBytes .. frontExtraBytes + extraPad]=padChar;
+ }
+ }
+ int z = frontExtraBytes;
+ if (lenBytes > minPadding)
+ {
+ // Strip leading zeros.
+ ptrdiff_t maxStrip = lenBytes - minPadding;
+ while (z< buff.length-1 && (buff[z]=='0' || buff[z]==padChar) && maxStrip>0)
+ {
+ ++z;
+ --maxStrip;
+ }
+ }
+ if (padChar!='0')
+ {
+ // Convert leading zeros into padChars.
+ for (size_t k= z; k< buff.length-1 && (buff[k]=='0' || buff[k]==padChar); ++k)
+ {
+ if (buff[k]=='0') buff[k]=padChar;
+ }
+ }
+ return buff[z-frontExtraBytes..$];
+ }
+
+ /**
+ * Convert to an octal string.
+ */
+ char[] toOctalString() const
+ {
+ auto predictLength = 1 + data.length*BigDigitBits / 3;
+ char[] buff = new char[predictLength];
+ size_t firstNonZero = biguintToOctal(buff, data);
+ return buff[firstNonZero .. $];
+ }
+
+ // return false if invalid character found
+ bool fromHexString(Range)(Range s) if (
+ isBidirectionalRange!Range && isSomeChar!(ElementType!Range))
+ {
+ import std.range : walkLength;
+
+ //Strip leading zeros
+ while (!s.empty && s.front == '0')
+ s.popFront;
+
+ if (s.empty)
+ {
+ data = ZERO;
+ return true;
+ }
+
+ immutable len = (s.save.walkLength + 15) / 4;
+ auto tmp = new BigDigit[len + 1];
+ uint part, sofar, partcount;
+
+ foreach_reverse (character; s)
+ {
+ if (character == '_')
+ continue;
+
+ uint x;
+ if (character >= '0' && character <= '9')
+ {
+ x = character - '0';
+ }
+ else if (character >= 'A' && character <= 'F')
+ {
+ x = character - 'A' + 10;
+ }
+ else if (character >= 'a' && character <= 'f')
+ {
+ x = character - 'a' + 10;
+ }
+ else
+ {
+ return false;
+ }
+
+ part >>= 4;
+ part |= (x << (32 - 4));
+ ++partcount;
+
+ if (partcount == 8)
+ {
+ tmp[sofar] = part;
+ ++sofar;
+ partcount = 0;
+ part = 0;
+ }
+ }
+ if (part)
+ {
+ for ( ; partcount != 8; ++partcount) part >>= 4;
+ tmp[sofar] = part;
+ ++sofar;
+ }
+ if (sofar == 0)
+ data = ZERO;
+ else
+ data = trustedAssumeUnique(tmp[0 .. sofar]);
+
+ return true;
+ }
+
+ // return true if OK; false if erroneous characters found
+ bool fromDecimalString(Range)(Range s) if (
+ isForwardRange!Range && isSomeChar!(ElementType!Range))
+ {
+ import std.range : walkLength;
+
+ while (!s.empty && s.front == '0')
+ {
+ s.popFront;
+ }
+
+ if (s.empty)
+ {
+ data = ZERO;
+ return true;
+ }
+
+ auto predict_length = (18 * 2 + 2 * s.save.walkLength) / 19;
+ auto tmp = new BigDigit[predict_length];
+
+ tmp.length = biguintFromDecimal(tmp, s);
+
+ data = trustedAssumeUnique(tmp);
+ return true;
+ }
+
+ ////////////////////////
+ //
+ // All of these member functions create a new BigUint.
+
+ // return x >> y
+ BigUint opShr(Tulong)(Tulong y) pure nothrow const if (is (Tulong == ulong))
+ {
+ assert(y>0);
+ uint bits = cast(uint) y & BIGDIGITSHIFTMASK;
+ if ((y >> LG2BIGDIGITBITS) >= data.length) return BigUint(ZERO);
+ uint words = cast(uint)(y >> LG2BIGDIGITBITS);
+ if (bits == 0)
+ {
+ return BigUint(data[words..$]);
+ }
+ else
+ {
+ uint [] result = new BigDigit[data.length - words];
+ multibyteShr(result, data[words..$], bits);
+
+ if (result.length > 1 && result[$-1] == 0)
+ return BigUint(trustedAssumeUnique(result[0 .. $-1]));
+ else
+ return BigUint(trustedAssumeUnique(result));
+ }
+ }
+
+ // return x << y
+ BigUint opShl(Tulong)(Tulong y) pure nothrow const if (is (Tulong == ulong))
+ {
+ assert(y>0);
+ if (isZero()) return this;
+ uint bits = cast(uint) y & BIGDIGITSHIFTMASK;
+ assert((y >> LG2BIGDIGITBITS) < cast(ulong)(uint.max));
+ uint words = cast(uint)(y >> LG2BIGDIGITBITS);
+ BigDigit [] result = new BigDigit[data.length + words+1];
+ result[0 .. words] = 0;
+ if (bits == 0)
+ {
+ result[words .. words+data.length] = data[];
+ return BigUint(trustedAssumeUnique(result[0 .. words+data.length]));
+ }
+ else
+ {
+ immutable c = multibyteShl(result[words .. words+data.length], data, bits);
+ if (c == 0) return BigUint(trustedAssumeUnique(result[0 .. words+data.length]));
+ result[$-1] = c;
+ return BigUint(trustedAssumeUnique(result));
+ }
+ }
+
+ // If wantSub is false, return x + y, leaving sign unchanged
+ // If wantSub is true, return abs(x - y), negating sign if x < y
+ static BigUint addOrSubInt(Tulong)(const BigUint x, Tulong y,
+ bool wantSub, ref bool sign) pure nothrow if (is(Tulong == ulong))
+ {
+ BigUint r;
+ if (wantSub)
+ { // perform a subtraction
+ if (x.data.length > 2)
+ {
+ r.data = subInt(x.data, y);
+ }
+ else
+ { // could change sign!
+ ulong xx = x.data[0];
+ if (x.data.length > 1)
+ xx += (cast(ulong) x.data[1]) << 32;
+ ulong d;
+ if (xx <= y)
+ {
+ d = y - xx;
+ sign = !sign;
+ }
+ else
+ {
+ d = xx - y;
+ }
+ if (d == 0)
+ {
+ r = 0UL;
+ sign = false;
+ return r;
+ }
+ if (d > uint.max)
+ {
+ r.data = [cast(uint)(d & 0xFFFF_FFFF), cast(uint)(d >> 32)];
+ }
+ else
+ {
+ r.data = [cast(uint)(d & 0xFFFF_FFFF)];
+ }
+ }
+ }
+ else
+ {
+ r.data = addInt(x.data, y);
+ }
+ return r;
+ }
+
+ // If wantSub is false, return x + y, leaving sign unchanged.
+ // If wantSub is true, return abs(x - y), negating sign if x<y
+ static BigUint addOrSub(BigUint x, BigUint y, bool wantSub, bool *sign)
+ pure nothrow
+ {
+ BigUint r;
+ if (wantSub)
+ { // perform a subtraction
+ bool negative;
+ r.data = sub(x.data, y.data, &negative);
+ *sign ^= negative;
+ if (r.isZero())
+ {
+ *sign = false;
+ }
+ }
+ else
+ {
+ r.data = add(x.data, y.data);
+ }
+ return r;
+ }
+
+
+ // return x*y.
+ // y must not be zero.
+ static BigUint mulInt(T = ulong)(BigUint x, T y) pure nothrow
+ {
+ if (y == 0 || x == 0) return BigUint(ZERO);
+ uint hi = cast(uint)(y >>> 32);
+ uint lo = cast(uint)(y & 0xFFFF_FFFF);
+ uint [] result = new BigDigit[x.data.length+1+(hi != 0)];
+ result[x.data.length] = multibyteMul(result[0 .. x.data.length], x.data, lo, 0);
+ if (hi != 0)
+ {
+ result[x.data.length+1] = multibyteMulAdd!('+')(result[1 .. x.data.length+1],
+ x.data, hi, 0);
+ }
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
+ }
+
+ /* return x * y.
+ */
+ static BigUint mul(BigUint x, BigUint y) pure nothrow
+ {
+ if (y == 0 || x == 0)
+ return BigUint(ZERO);
+ auto len = x.data.length + y.data.length;
+ BigDigit [] result = new BigDigit[len];
+ if (y.data.length > x.data.length)
+ {
+ mulInternal(result, y.data, x.data);
+ }
+ else
+ {
+ if (x.data[]==y.data[]) squareInternal(result, x.data);
+ else mulInternal(result, x.data, y.data);
+ }
+ // the highest element could be zero,
+ // in which case we need to reduce the length
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
+ }
+
+ // return x / y
+ static BigUint divInt(T)(BigUint x, T y_) pure nothrow
+ if ( is(Unqual!T == uint) )
+ {
+ uint y = y_;
+ if (y == 1)
+ return x;
+ uint [] result = new BigDigit[x.data.length];
+ if ((y&(-y))==y)
+ {
+ assert(y != 0, "BigUint division by zero");
+ // perfect power of 2
+ uint b = 0;
+ for (;y != 1; y>>=1)
+ {
+ ++b;
+ }
+ multibyteShr(result, x.data, b);
+ }
+ else
+ {
+ result[] = x.data[];
+ cast(void) multibyteDivAssign(result, y, 0);
+ }
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
+ }
+
+ static BigUint divInt(T)(BigUint x, T y) pure nothrow
+ if ( is(Unqual!T == ulong) )
+ {
+ if (y <= uint.max)
+ return divInt!uint(x, cast(uint) y);
+ if (x.data.length < 2)
+ return BigUint(ZERO);
+ uint hi = cast(uint)(y >>> 32);
+ uint lo = cast(uint)(y & 0xFFFF_FFFF);
+ immutable uint[2] z = [lo, hi];
+ BigDigit[] result = new BigDigit[x.data.length - z.length + 1];
+ divModInternal(result, null, x.data, z[]);
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
+ }
+
+ // return x % y
+ static uint modInt(T)(BigUint x, T y_) pure if ( is(Unqual!T == uint) )
+ {
+ import core.memory : GC;
+ uint y = y_;
+ assert(y != 0);
+ if ((y&(-y)) == y)
+ { // perfect power of 2
+ return x.data[0] & (y-1);
+ }
+ else
+ {
+ // horribly inefficient - malloc, copy, & store are unnecessary.
+ uint [] wasteful = new BigDigit[x.data.length];
+ wasteful[] = x.data[];
+ immutable rem = multibyteDivAssign(wasteful, y, 0);
+ () @trusted { GC.free(wasteful.ptr); } ();
+ return rem;
+ }
+ }
+
+ // return x / y
+ static BigUint div(BigUint x, BigUint y) pure nothrow
+ {
+ if (y.data.length > x.data.length)
+ return BigUint(ZERO);
+ if (y.data.length == 1)
+ return divInt(x, y.data[0]);
+ BigDigit [] result = new BigDigit[x.data.length - y.data.length + 1];
+ divModInternal(result, null, x.data, y.data);
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
+ }
+
+ // return x % y
+ static BigUint mod(BigUint x, BigUint y) pure nothrow
+ {
+ if (y.data.length > x.data.length) return x;
+ if (y.data.length == 1)
+ {
+ return BigUint([modInt(x, y.data[0])]);
+ }
+ BigDigit [] result = new BigDigit[x.data.length - y.data.length + 1];
+ BigDigit [] rem = new BigDigit[y.data.length];
+ divModInternal(result, rem, x.data, y.data);
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(rem)));
+ }
+
+ // return x op y
+ static BigUint bitwiseOp(string op)(BigUint x, BigUint y, bool xSign, bool ySign, ref bool resultSign)
+ pure nothrow @safe if (op == "|" || op == "^" || op == "&")
+ {
+ auto d1 = includeSign(x.data, y.uintLength, xSign);
+ auto d2 = includeSign(y.data, x.uintLength, ySign);
+
+ foreach (i; 0 .. d1.length)
+ {
+ mixin("d1[i] " ~ op ~ "= d2[i];");
+ }
+
+ mixin("resultSign = xSign " ~ op ~ " ySign;");
+
+ if (resultSign)
+ {
+ twosComplement(d1, d1);
+ }
+
+ return BigUint(removeLeadingZeros(trustedAssumeUnique(d1)));
+ }
+
+ /**
+ * Return a BigUint which is x raised to the power of y.
+ * Method: Powers of 2 are removed from x, then left-to-right binary
+ * exponentiation is used.
+ * Memory allocation is minimized: at most one temporary BigUint is used.
+ */
+ static BigUint pow(BigUint x, ulong y) pure nothrow
+ {
+ // Deal with the degenerate cases first.
+ if (y == 0) return BigUint(ONE);
+ if (y == 1) return x;
+ if (x == 0 || x == 1) return x;
+
+ BigUint result;
+
+ // Simplify, step 1: Remove all powers of 2.
+ uint firstnonzero = firstNonZeroDigit(x.data);
+ // Now we know x = x[firstnonzero..$] * (2^^(firstnonzero*BigDigitBits))
+ // where BigDigitBits = BigDigit.sizeof * 8
+
+ // See if x[firstnonzero..$] can now fit into a single digit.
+ bool singledigit = ((x.data.length - firstnonzero) == 1);
+ // If true, then x0 is that digit
+ // and the result will be (x0 ^^ y) * (2^^(firstnonzero*y*BigDigitBits))
+ BigDigit x0 = x.data[firstnonzero];
+ assert(x0 != 0);
+ // Length of the non-zero portion
+ size_t nonzerolength = x.data.length - firstnonzero;
+ ulong y0;
+ uint evenbits = 0; // number of even bits in the bottom of x
+ while (!(x0 & 1))
+ {
+ x0 >>= 1;
+ ++evenbits;
+ }
+
+ if (x.data.length- firstnonzero == 2)
+ {
+ // Check for a single digit straddling a digit boundary
+ const BigDigit x1 = x.data[firstnonzero+1];
+ if ((x1 >> evenbits) == 0)
+ {
+ x0 |= (x1 << (BigDigit.sizeof * 8 - evenbits));
+ singledigit = true;
+ }
+ }
+ // Now if (singledigit), x^^y = (x0 ^^ y) * 2^^(evenbits * y) * 2^^(firstnonzero*y*BigDigitBits))
+
+ uint evenshiftbits = 0; // Total powers of 2 to shift by, at the end
+
+ // Simplify, step 2: For singledigits, see if we can trivially reduce y
+
+ BigDigit finalMultiplier = 1UL;
+
+ if (singledigit)
+ {
+ // x fits into a single digit. Raise it to the highest power we can
+ // that still fits into a single digit, then reduce the exponent accordingly.
+ // We're quite likely to have a residual multiply at the end.
+ // For example, 10^^100 = (((5^^13)^^7) * 5^^9) * 2^^100.
+ // and 5^^13 still fits into a uint.
+ evenshiftbits = cast(uint)( (evenbits * y) & BIGDIGITSHIFTMASK);
+ if (x0 == 1)
+ { // Perfect power of 2
+ result = 1UL;
+ return result << (evenbits + firstnonzero * 8 * BigDigit.sizeof) * y;
+ }
+ immutable p = highestPowerBelowUintMax(x0);
+ if (y <= p)
+ { // Just do it with pow
+ result = cast(ulong) intpow(x0, y);
+ if (evenbits + firstnonzero == 0)
+ return result;
+ return result << (evenbits + firstnonzero * 8 * BigDigit.sizeof) * y;
+ }
+ y0 = y / p;
+ finalMultiplier = intpow(x0, y - y0*p);
+ x0 = intpow(x0, p);
+ // Result is x0
+ nonzerolength = 1;
+ }
+ // Now if (singledigit), x^^y = finalMultiplier * (x0 ^^ y0) * 2^^(evenbits * y) * 2^^(firstnonzero*y*BigDigitBits))
+
+ // Perform a crude check for overflow and allocate result buffer.
+ // The length required is y * lg2(x) bits.
+ // which will always fit into y*x.length digits. But this is
+ // a gross overestimate if x is small (length 1 or 2) and the highest
+ // digit is nearly empty.
+ // A better estimate is:
+ // y * lg2(x[$-1]/BigDigit.max) + y * (x.length - 1) digits,
+ // and the first term is always between
+ // y * (bsr(x.data[$-1]) + 1) / BIGDIGITBITS and
+ // y * (bsr(x.data[$-1]) + 2) / BIGDIGITBITS
+ // For single digit payloads, we already have
+ // x^^y = finalMultiplier * (x0 ^^ y0) * 2^^(evenbits * y) * 2^^(firstnonzero*y*BigDigitBits))
+ // and x0 is almost a full digit, so it's a tight estimate.
+ // Number of digits is therefore 1 + x0.length*y0 + (evenbits*y)/BIGDIGIT + firstnonzero*y
+ // Note that the divisions must be rounded up.
+
+ // Estimated length in BigDigits
+ immutable estimatelength = singledigit
+ ? 1 + y0 + ((evenbits*y + BigDigit.sizeof * 8 - 1) / (BigDigit.sizeof *8)) + firstnonzero*y
+ : x.data.length * y;
+ // Imprecise check for overflow. Makes the extreme cases easier to debug
+ // (less extreme overflow will result in an out of memory error).
+ if (estimatelength > uint.max/(4*BigDigit.sizeof))
+ assert(0, "Overflow in BigInt.pow");
+
+ // The result buffer includes space for all the trailing zeros
+ BigDigit [] resultBuffer = new BigDigit[cast(size_t) estimatelength];
+
+ // Do all the powers of 2!
+ size_t result_start = cast(size_t)( firstnonzero * y
+ + (singledigit ? ((evenbits * y) >> LG2BIGDIGITBITS) : 0));
+
+ resultBuffer[0 .. result_start] = 0;
+ BigDigit [] t1 = resultBuffer[result_start..$];
+ BigDigit [] r1;
+
+ if (singledigit)
+ {
+ r1 = t1[0 .. 1];
+ r1[0] = x0;
+ y = y0;
+ }
+ else
+ {
+ // It's not worth right shifting by evenbits unless we also shrink the length after each
+ // multiply or squaring operation. That might still be worthwhile for large y.
+ r1 = t1[0 .. x.data.length - firstnonzero];
+ r1[0..$] = x.data[firstnonzero..$];
+ }
+
+ if (y>1)
+ { // Set r1 = r1 ^^ y.
+ // The secondary buffer only needs space for the multiplication results
+ BigDigit [] t2 = new BigDigit[resultBuffer.length - result_start];
+ BigDigit [] r2;
+
+ int shifts = 63; // num bits in a long
+ while (!(y & 0x8000_0000_0000_0000L))
+ {
+ y <<= 1;
+ --shifts;
+ }
+ y <<=1;
+
+ while (y != 0)
+ {
+ // For each bit of y: Set r1 = r1 * r1
+ // If the bit is 1, set r1 = r1 * x
+ // Eg, if y is 0b101, result = ((x^^2)^^2)*x == x^^5.
+ // Optimization opportunity: if more than 2 bits in y are set,
+ // it's usually possible to reduce the number of multiplies
+ // by caching odd powers of x. eg for y = 54,
+ // (0b110110), set u = x^^3, and result is ((u^^8)*u)^^2
+ r2 = t2[0 .. r1.length*2];
+ squareInternal(r2, r1);
+ if (y & 0x8000_0000_0000_0000L)
+ {
+ r1 = t1[0 .. r2.length + nonzerolength];
+ if (singledigit)
+ {
+ r1[$-1] = multibyteMul(r1[0 .. $-1], r2, x0, 0);
+ }
+ else
+ {
+ mulInternal(r1, r2, x.data[firstnonzero..$]);
+ }
+ }
+ else
+ {
+ r1 = t1[0 .. r2.length];
+ r1[] = r2[];
+ }
+ y <<=1;
+ shifts--;
+ }
+ while (shifts>0)
+ {
+ r2 = t2[0 .. r1.length * 2];
+ squareInternal(r2, r1);
+ r1 = t1[0 .. r2.length];
+ r1[] = r2[];
+ --shifts;
+ }
+ }
+
+ if (finalMultiplier != 1)
+ {
+ const BigDigit carry = multibyteMul(r1, r1, finalMultiplier, 0);
+ if (carry)
+ {
+ r1 = t1[0 .. r1.length + 1];
+ r1[$-1] = carry;
+ }
+ }
+ if (evenshiftbits)
+ {
+ const BigDigit carry = multibyteShl(r1, r1, evenshiftbits);
+ if (carry != 0)
+ {
+ r1 = t1[0 .. r1.length + 1];
+ r1[$ - 1] = carry;
+ }
+ }
+ while (r1[$ - 1]==0)
+ {
+ r1=r1[0 .. $ - 1];
+ }
+ return BigUint(trustedAssumeUnique(resultBuffer[0 .. result_start + r1.length]));
+ }
+
+ // Implement toHash so that BigUint works properly as an AA key.
+ size_t toHash() const @trusted nothrow
+ {
+ return typeid(data).getHash(&data);
+ }
+
+} // end BigUint
+
+@safe pure nothrow unittest
+{
+ // ulong comparison test
+ BigUint a = [1];
+ assert(a == 1);
+ assert(a < 0x8000_0000_0000_0000UL); // bug 9548
+
+ // bug 12234
+ BigUint z = [0];
+ assert(z == 0UL);
+ assert(!(z > 0UL));
+ assert(!(z < 0UL));
+}
+
+// Remove leading zeros from x, to restore the BigUint invariant
+inout(BigDigit) [] removeLeadingZeros(inout(BigDigit) [] x) pure nothrow @safe
+{
+ size_t k = x.length;
+ while (k>1 && x[k - 1]==0) --k;
+ return x[0 .. k];
+}
+
+pure @system unittest
+{
+ BigUint r = BigUint([5]);
+ BigUint t = BigUint([7]);
+ BigUint s = BigUint.mod(r, t);
+ assert(s == 5);
+}
+
+
+@safe pure unittest
+{
+ BigUint r;
+ r = 5UL;
+ assert(r.peekUlong(0) == 5UL);
+ assert(r.peekUint(0) == 5U);
+ r = 0x1234_5678_9ABC_DEF0UL;
+ assert(r.peekUlong(0) == 0x1234_5678_9ABC_DEF0UL);
+ assert(r.peekUint(0) == 0x9ABC_DEF0U);
+}
+
+
+// Pow tests
+pure @system unittest
+{
+ BigUint r, s;
+ r.fromHexString("80000000_00000001");
+ s = BigUint.pow(r, 5);
+ r.fromHexString("08000000_00000000_50000000_00000001_40000000_00000002_80000000"
+ ~ "_00000002_80000000_00000001");
+ assert(s == r);
+ s = 10UL;
+ s = BigUint.pow(s, 39);
+ r.fromDecimalString("1000000000000000000000000000000000000000");
+ assert(s == r);
+ r.fromHexString("1_E1178E81_00000000");
+ s = BigUint.pow(r, 15); // Regression test: this used to overflow array bounds
+
+ r.fromDecimalString("000_000_00");
+ assert(r == 0);
+ r.fromDecimalString("0007");
+ assert(r == 7);
+ r.fromDecimalString("0");
+ assert(r == 0);
+}
+
+// Radix conversion tests
+@safe pure unittest
+{
+ BigUint r;
+ r.fromHexString("1_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 0) == "1_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 20) == "0001_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 16+8) == "00000001_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 16+9) == "0_00000001_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 16+8+8) == "00000000_00000001_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 16+8+8+1) == "0_00000000_00000001_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 16+8+8+1, ' ') == " 1_E1178E81_00000000");
+ assert(r.toHexString(0, 0, 16+8+8+1) == "00000000000000001E1178E8100000000");
+ r = 0UL;
+ assert(r.toHexString(0, '_', 0) == "0");
+ assert(r.toHexString(0, '_', 7) == "0000000");
+ assert(r.toHexString(0, '_', 7, ' ') == " 0");
+ assert(r.toHexString(0, '#', 9) == "0#00000000");
+ assert(r.toHexString(0, 0, 9) == "000000000");
+}
+
+//
+@safe pure unittest
+{
+ BigUint r;
+ r.fromHexString("1_E1178E81_00000000");
+ assert(r.toHexString(0, '_', 0, '0', LetterCase.lower) == "1_e1178e81_00000000");
+ assert(r.toHexString(0, '_', 20, '0', LetterCase.lower) == "0001_e1178e81_00000000");
+ assert(r.toHexString(0, '_', 16+8, '0', LetterCase.lower) == "00000001_e1178e81_00000000");
+ assert(r.toHexString(0, '_', 16+9, '0', LetterCase.lower) == "0_00000001_e1178e81_00000000");
+ assert(r.toHexString(0, '_', 16+8+8, '0', LetterCase.lower) == "00000000_00000001_e1178e81_00000000");
+ assert(r.toHexString(0, '_', 16+8+8+1, '0', LetterCase.lower) == "0_00000000_00000001_e1178e81_00000000");
+ assert(r.toHexString(0, '_', 16+8+8+1, ' ', LetterCase.lower) == " 1_e1178e81_00000000");
+ assert(r.toHexString(0, 0, 16+8+8+1, '0', LetterCase.lower) == "00000000000000001e1178e8100000000");
+ r = 0UL;
+ assert(r.toHexString(0, '_', 0, '0', LetterCase.lower) == "0");
+ assert(r.toHexString(0, '_', 7, '0', LetterCase.lower) == "0000000");
+ assert(r.toHexString(0, '_', 7, ' ', LetterCase.lower) == " 0");
+ assert(r.toHexString(0, '#', 9, '0', LetterCase.lower) == "0#00000000");
+ assert(r.toHexString(0, 'Z', 9, '0', LetterCase.lower) == "0Z00000000");
+ assert(r.toHexString(0, 0, 9, '0', LetterCase.lower) == "000000000");
+}
+
+
+private:
+void twosComplement(const(BigDigit) [] x, BigDigit[] result)
+pure nothrow @safe
+{
+ foreach (i; 0 .. x.length)
+ {
+ result[i] = ~x[i];
+ }
+ result[x.length..$] = BigDigit.max;
+
+ foreach (i; 0 .. result.length)
+ {
+ if (result[i] == BigDigit.max)
+ {
+ result[i] = 0;
+ }
+ else
+ {
+ result[i] += 1;
+ break;
+ }
+ }
+}
+
+// Encode BigInt as BigDigit array (sign and 2's complement)
+BigDigit[] includeSign(const(BigDigit) [] x, size_t minSize, bool sign)
+pure nothrow @safe
+{
+ size_t length = (x.length > minSize) ? x.length : minSize;
+ BigDigit [] result = new BigDigit[length];
+ if (sign)
+ {
+ twosComplement(x, result);
+ }
+ else
+ {
+ result[0 .. x.length] = x;
+ }
+ return result;
+}
+
+// works for any type
+T intpow(T)(T x, ulong n) pure nothrow @safe
+{
+ T p;
+
+ switch (n)
+ {
+ case 0:
+ p = 1;
+ break;
+
+ case 1:
+ p = x;
+ break;
+
+ case 2:
+ p = x * x;
+ break;
+
+ default:
+ p = 1;
+ while (1)
+ {
+ if (n & 1)
+ p *= x;
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ }
+ break;
+ }
+ return p;
+}
+
+
+// returns the maximum power of x that will fit in a uint.
+int highestPowerBelowUintMax(uint x) pure nothrow @safe
+{
+ assert(x>1);
+ static immutable ubyte [22] maxpwr = [ 31, 20, 15, 13, 12, 11, 10, 10, 9, 9,
+ 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7];
+ if (x<24) return maxpwr[x-2];
+ if (x<41) return 6;
+ if (x<85) return 5;
+ if (x<256) return 4;
+ if (x<1626) return 3;
+ if (x<65_536) return 2;
+ return 1;
+}
+
+// returns the maximum power of x that will fit in a ulong.
+int highestPowerBelowUlongMax(uint x) pure nothrow @safe
+{
+ assert(x>1);
+ static immutable ubyte [39] maxpwr = [ 63, 40, 31, 27, 24, 22, 21, 20, 19, 18,
+ 17, 17, 16, 16, 15, 15, 15, 15, 14, 14,
+ 14, 14, 13, 13, 13, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12];
+ if (x<41) return maxpwr[x-2];
+ if (x<57) return 11;
+ if (x<85) return 10;
+ if (x<139) return 9;
+ if (x<256) return 8;
+ if (x<566) return 7;
+ if (x<1626) return 6;
+ if (x<7132) return 5;
+ if (x<65_536) return 4;
+ if (x<2_642_246) return 3;
+ return 2;
+}
+
+version (unittest)
+{
+
+int slowHighestPowerBelowUintMax(uint x) pure nothrow @safe
+{
+ int pwr = 1;
+ for (ulong q = x;x*q < cast(ulong) uint.max; )
+ {
+ q*=x; ++pwr;
+ }
+ return pwr;
+}
+
+@safe pure unittest
+{
+ assert(highestPowerBelowUintMax(10)==9);
+ for (int k=82; k<88; ++k)
+ {
+ assert(highestPowerBelowUintMax(k)== slowHighestPowerBelowUintMax(k));
+ }
+}
+
+}
+
+
+/* General unsigned subtraction routine for bigints.
+ * Sets result = x - y. If the result is negative, negative will be true.
+ */
+BigDigit [] sub(const BigDigit [] x, const BigDigit [] y, bool *negative)
+pure nothrow
+{
+ if (x.length == y.length)
+ {
+ // There's a possibility of cancellation, if x and y are almost equal.
+ ptrdiff_t last = highestDifferentDigit(x, y);
+ BigDigit [] result = new BigDigit[last+1];
+ if (x[last] < y[last])
+ { // we know result is negative
+ multibyteSub(result[0 .. last+1], y[0 .. last+1], x[0 .. last+1], 0);
+ *negative = true;
+ }
+ else
+ { // positive or zero result
+ multibyteSub(result[0 .. last+1], x[0 .. last+1], y[0 .. last+1], 0);
+ *negative = false;
+ }
+ while (result.length > 1 && result[$-1] == 0)
+ {
+ result = result[0..$-1];
+ }
+// if (result.length >1 && result[$-1]==0) return result[0..$-1];
+ return result;
+ }
+ // Lengths are different
+ const(BigDigit) [] large, small;
+ if (x.length < y.length)
+ {
+ *negative = true;
+ large = y; small = x;
+ }
+ else
+ {
+ *negative = false;
+ large = x; small = y;
+ }
+ // result.length will be equal to larger length, or could decrease by 1.
+
+
+ BigDigit [] result = new BigDigit[large.length];
+ BigDigit carry = multibyteSub(result[0 .. small.length], large[0 .. small.length], small, 0);
+ result[small.length..$] = large[small.length..$];
+ if (carry)
+ {
+ multibyteIncrementAssign!('-')(result[small.length..$], carry);
+ }
+ while (result.length > 1 && result[$-1] == 0)
+ {
+ result = result[0..$-1];
+ }
+ return result;
+}
+
+
+// return a + b
+BigDigit [] add(const BigDigit [] a, const BigDigit [] b) pure nothrow
+{
+ const(BigDigit) [] x, y;
+ if (a.length < b.length)
+ {
+ x = b; y = a;
+ }
+ else
+ {
+ x = a; y = b;
+ }
+ // now we know x.length > y.length
+ // create result. add 1 in case it overflows
+ BigDigit [] result = new BigDigit[x.length + 1];
+
+ BigDigit carry = multibyteAdd(result[0 .. y.length], x[0 .. y.length], y, 0);
+ if (x.length != y.length)
+ {
+ result[y.length..$-1]= x[y.length..$];
+ carry = multibyteIncrementAssign!('+')(result[y.length..$-1], carry);
+ }
+ if (carry)
+ {
+ result[$-1] = carry;
+ return result;
+ }
+ else
+ return result[0..$-1];
+}
+
+/** return x + y
+ */
+BigDigit [] addInt(const BigDigit[] x, ulong y) pure nothrow
+{
+ uint hi = cast(uint)(y >>> 32);
+ uint lo = cast(uint)(y& 0xFFFF_FFFF);
+ auto len = x.length;
+ if (x.length < 2 && hi != 0) ++len;
+ BigDigit [] result = new BigDigit[len+1];
+ result[0 .. x.length] = x[];
+ if (x.length < 2 && hi != 0)
+ {
+ result[1]=hi;
+ hi=0;
+ }
+ uint carry = multibyteIncrementAssign!('+')(result[0..$-1], lo);
+ if (hi != 0) carry += multibyteIncrementAssign!('+')(result[1..$-1], hi);
+ if (carry)
+ {
+ result[$-1] = carry;
+ return result;
+ }
+ else
+ return result[0..$-1];
+}
+
+/** Return x - y.
+ * x must be greater than y.
+ */
+BigDigit [] subInt(const BigDigit[] x, ulong y) pure nothrow
+{
+ uint hi = cast(uint)(y >>> 32);
+ uint lo = cast(uint)(y & 0xFFFF_FFFF);
+ BigDigit [] result = new BigDigit[x.length];
+ result[] = x[];
+ multibyteIncrementAssign!('-')(result[], lo);
+ if (hi)
+ multibyteIncrementAssign!('-')(result[1..$], hi);
+ if (result[$-1] == 0)
+ return result[0..$-1];
+ else
+ return result;
+}
+
+/** General unsigned multiply routine for bigints.
+ * Sets result = x * y.
+ *
+ * The length of y must not be larger than the length of x.
+ * Different algorithms are used, depending on the lengths of x and y.
+ * TODO: "Modern Computer Arithmetic" suggests the OddEvenKaratsuba algorithm for the
+ * unbalanced case. (But I doubt it would be faster in practice).
+ *
+ */
+void mulInternal(BigDigit[] result, const(BigDigit)[] x, const(BigDigit)[] y)
+ pure nothrow
+{
+ import core.memory : GC;
+ assert( result.length == x.length + y.length );
+ assert( y.length > 0 );
+ assert( x.length >= y.length);
+ if (y.length <= KARATSUBALIMIT)
+ {
+ // Small multiplier, we'll just use the asm classic multiply.
+ if (y.length == 1)
+ { // Trivial case, no cache effects to worry about
+ result[x.length] = multibyteMul(result[0 .. x.length], x, y[0], 0);
+ return;
+ }
+
+ if (x.length + y.length < CACHELIMIT)
+ return mulSimple(result, x, y);
+
+ // If x is so big that it won't fit into the cache, we divide it into chunks
+ // Every chunk must be greater than y.length.
+ // We make the first chunk shorter, if necessary, to ensure this.
+
+ auto chunksize = CACHELIMIT / y.length;
+ immutable residual = x.length % chunksize;
+ if (residual < y.length)
+ {
+ chunksize -= y.length;
+ }
+
+ // Use schoolbook multiply.
+ mulSimple(result[0 .. chunksize + y.length], x[0 .. chunksize], y);
+ auto done = chunksize;
+
+ while (done < x.length)
+ {
+ // result[done .. done+ylength] already has a value.
+ chunksize = (done + (CACHELIMIT / y.length) < x.length) ? (CACHELIMIT / y.length) : x.length - done;
+ BigDigit [KARATSUBALIMIT] partial;
+ partial[0 .. y.length] = result[done .. done+y.length];
+ mulSimple(result[done .. done+chunksize+y.length], x[done .. done+chunksize], y);
+ addAssignSimple(result[done .. done+chunksize + y.length], partial[0 .. y.length]);
+ done += chunksize;
+ }
+ return;
+ }
+
+ immutable half = (x.length >> 1) + (x.length & 1);
+ if (2*y.length*y.length <= x.length*x.length)
+ {
+ // UNBALANCED MULTIPLY
+ // Use school multiply to cut into quasi-squares of Karatsuba-size
+ // or larger. The ratio of the two sides of the 'square' must be
+ // between 1.414:1 and 1:1. Use Karatsuba on each chunk.
+ //
+ // For maximum performance, we want the ratio to be as close to
+ // 1:1 as possible. To achieve this, we can either pad x or y.
+ // The best choice depends on the modulus x%y.
+ auto numchunks = x.length / y.length;
+ auto chunksize = y.length;
+ auto extra = x.length % y.length;
+ auto maxchunk = chunksize + extra;
+ bool paddingY; // true = we're padding Y, false = we're padding X.
+ if (extra * extra * 2 < y.length*y.length)
+ {
+ // The leftover bit is small enough that it should be incorporated
+ // in the existing chunks.
+ // Make all the chunks a tiny bit bigger
+ // (We're padding y with zeros)
+ chunksize += extra / numchunks;
+ extra = x.length - chunksize*numchunks;
+ // there will probably be a few left over.
+ // Every chunk will either have size chunksize, or chunksize+1.
+ maxchunk = chunksize + 1;
+ paddingY = true;
+ assert(chunksize + extra + chunksize *(numchunks-1) == x.length );
+ }
+ else
+ {
+ // the extra bit is large enough that it's worth making a new chunk.
+ // (This means we're padding x with zeros, when doing the first one).
+ maxchunk = chunksize;
+ ++numchunks;
+ paddingY = false;
+ assert(extra + chunksize *(numchunks-1) == x.length );
+ }
+ // We make the buffer a bit bigger so we have space for the partial sums.
+ BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(maxchunk) + y.length];
+ BigDigit [] partial = scratchbuff[$ - y.length .. $];
+ size_t done; // how much of X have we done so far?
+ if (paddingY)
+ {
+ // If the first chunk is bigger, do it first. We're padding y.
+ mulKaratsuba(result[0 .. y.length + chunksize + (extra > 0 ? 1 : 0 )],
+ x[0 .. chunksize + (extra>0?1:0)], y, scratchbuff);
+ done = chunksize + (extra > 0 ? 1 : 0);
+ if (extra) --extra;
+ }
+ else
+ { // We're padding X. Begin with the extra bit.
+ mulKaratsuba(result[0 .. y.length + extra], y, x[0 .. extra], scratchbuff);
+ done = extra;
+ extra = 0;
+ }
+ immutable basechunksize = chunksize;
+ while (done < x.length)
+ {
+ chunksize = basechunksize + (extra > 0 ? 1 : 0);
+ if (extra) --extra;
+ partial[] = result[done .. done+y.length];
+ mulKaratsuba(result[done .. done + y.length + chunksize],
+ x[done .. done+chunksize], y, scratchbuff);
+ addAssignSimple(result[done .. done + y.length + chunksize], partial);
+ done += chunksize;
+ }
+ () @trusted { GC.free(scratchbuff.ptr); } ();
+ }
+ else
+ {
+ // Balanced. Use Karatsuba directly.
+ BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(x.length)];
+ mulKaratsuba(result, x, y, scratchbuff);
+ () @trusted { GC.free(scratchbuff.ptr); } ();
+ }
+}
+
+/** General unsigned squaring routine for BigInts.
+ * Sets result = x*x.
+ * NOTE: If the highest half-digit of x is zero, the highest digit of result will
+ * also be zero.
+ */
+void squareInternal(BigDigit[] result, const BigDigit[] x) pure nothrow
+{
+ import core.memory : GC;
+ // Squaring is potentially half a multiply, plus add the squares of
+ // the diagonal elements.
+ assert(result.length == 2*x.length);
+ if (x.length <= KARATSUBASQUARELIMIT)
+ {
+ if (x.length == 1)
+ {
+ result[1] = multibyteMul(result[0 .. 1], x, x[0], 0);
+ return;
+ }
+ return squareSimple(result, x);
+ }
+ // The nice thing about squaring is that it always stays balanced
+ BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(x.length)];
+ squareKaratsuba(result, x, scratchbuff);
+ () @trusted { GC.free(scratchbuff.ptr); } ();
+}
+
+
+import core.bitop : bsr;
+
+/// if remainder is null, only calculate quotient.
+void divModInternal(BigDigit [] quotient, BigDigit[] remainder, const BigDigit [] u,
+ const BigDigit [] v) pure nothrow
+{
+ import core.memory : GC;
+ assert(quotient.length == u.length - v.length + 1);
+ assert(remainder == null || remainder.length == v.length);
+ assert(v.length > 1);
+ assert(u.length >= v.length);
+
+ // Normalize by shifting v left just enough so that
+ // its high-order bit is on, and shift u left the
+ // same amount. The highest bit of u will never be set.
+
+ BigDigit [] vn = new BigDigit[v.length];
+ BigDigit [] un = new BigDigit[u.length + 1];
+ // How much to left shift v, so that its MSB is set.
+ uint s = BIGDIGITSHIFTMASK - bsr(v[$-1]);
+ if (s != 0)
+ {
+ multibyteShl(vn, v, s);
+ un[$-1] = multibyteShl(un[0..$-1], u, s);
+ }
+ else
+ {
+ vn[] = v[];
+ un[0..$-1] = u[];
+ un[$-1] = 0;
+ }
+ if (quotient.length<FASTDIVLIMIT)
+ {
+ schoolbookDivMod(quotient, un, vn);
+ }
+ else
+ {
+ blockDivMod(quotient, un, vn);
+ }
+
+ // Unnormalize remainder, if required.
+ if (remainder != null)
+ {
+ if (s == 0) remainder[] = un[0 .. vn.length];
+ else multibyteShr(remainder, un[0 .. vn.length+1], s);
+ }
+ () @trusted { GC.free(un.ptr); GC.free(vn.ptr); } ();
+}
+
+pure @system unittest
+{
+ immutable(uint) [] u = [0, 0xFFFF_FFFE, 0x8000_0000];
+ immutable(uint) [] v = [0xFFFF_FFFF, 0x8000_0000];
+ uint [] q = new uint[u.length - v.length + 1];
+ uint [] r = new uint[2];
+ divModInternal(q, r, u, v);
+ assert(q[]==[0xFFFF_FFFFu, 0]);
+ assert(r[]==[0xFFFF_FFFFu, 0x7FFF_FFFF]);
+ u = [0, 0xFFFF_FFFE, 0x8000_0001];
+ v = [0xFFFF_FFFF, 0x8000_0000];
+ divModInternal(q, r, u, v);
+}
+
+
+private:
+// Converts a big uint to a hexadecimal string.
+//
+// Optionally, a separator character (eg, an underscore) may be added between
+// every 8 digits.
+// buff.length must be data.length*8 if separator is zero,
+// or data.length*9 if separator is non-zero. It will be completely filled.
+char [] biguintToHex(char [] buff, const BigDigit [] data, char separator=0,
+ LetterCase letterCase = LetterCase.upper) pure nothrow @safe
+{
+ int x=0;
+ for (ptrdiff_t i=data.length - 1; i >= 0; --i)
+ {
+ toHexZeroPadded(buff[x .. x+8], data[i], letterCase);
+ x+=8;
+ if (separator)
+ {
+ if (i>0) buff[x] = separator;
+ ++x;
+ }
+ }
+ return buff;
+}
+
+/**
+ * Convert a big uint into an octal string.
+ *
+ * Params:
+ * buff = The destination buffer for the octal string. Must be large enough to
+ * store the result, including leading zeroes, which is
+ * ceil(data.length * BigDigitBits / 3) characters.
+ * The buffer is filled from back to front, starting from `buff[$-1]`.
+ * data = The biguint to be converted.
+ *
+ * Returns: The index of the leading non-zero digit in `buff`. Will be
+ * `buff.length - 1` if the entire big uint is zero.
+ */
+size_t biguintToOctal(char[] buff, const(BigDigit)[] data)
+ pure nothrow @safe @nogc
+{
+ ubyte carry = 0;
+ int shift = 0;
+ size_t penPos = buff.length - 1;
+ size_t lastNonZero = buff.length - 1;
+
+ pragma(inline) void output(uint digit) @nogc nothrow
+ {
+ if (digit != 0)
+ lastNonZero = penPos;
+ buff[penPos--] = cast(char)('0' + digit);
+ }
+
+ foreach (bigdigit; data)
+ {
+ if (shift < 0)
+ {
+ // Some bits were carried over from previous word.
+ assert(shift > -3);
+ output(((bigdigit << -shift) | carry) & 0b111);
+ shift += 3;
+ assert(shift > 0);
+ }
+
+ while (shift <= BigDigitBits - 3)
+ {
+ output((bigdigit >>> shift) & 0b111);
+ shift += 3;
+ }
+
+ if (shift < BigDigitBits)
+ {
+ // Some bits need to be carried forward.
+ carry = (bigdigit >>> shift) & 0b11;
+ }
+ shift -= BigDigitBits;
+ assert(shift >= -2 && shift <= 0);
+ }
+
+ if (shift < 0)
+ {
+ // Last word had bits that haven't been output yet.
+ assert(shift > -3);
+ output(carry);
+ }
+
+ return lastNonZero;
+}
+
+/** Convert a big uint into a decimal string.
+ *
+ * Params:
+ * data The biguint to be converted. Will be destroyed.
+ * buff The destination buffer for the decimal string. Must be
+ * large enough to store the result, including leading zeros.
+ * Will be filled backwards, starting from buff[$-1].
+ *
+ * buff.length must be >= (data.length*32)/log2(10) = 9.63296 * data.length.
+ * Returns:
+ * the lowest index of buff which was used.
+ */
+size_t biguintToDecimal(char [] buff, BigDigit [] data) pure nothrow
+{
+ ptrdiff_t sofar = buff.length;
+ // Might be better to divide by (10^38/2^32) since that gives 38 digits for
+ // the price of 3 divisions and a shr; this version only gives 27 digits
+ // for 3 divisions.
+ while (data.length>1)
+ {
+ uint rem = multibyteDivAssign(data, 10_0000_0000, 0);
+ itoaZeroPadded(buff[sofar-9 .. sofar], rem);
+ sofar -= 9;
+ if (data[$-1] == 0 && data.length > 1)
+ {
+ data.length = data.length - 1;
+ }
+ }
+ itoaZeroPadded(buff[sofar-10 .. sofar], data[0]);
+ sofar -= 10;
+ // and strip off the leading zeros
+ while (sofar != buff.length-1 && buff[sofar] == '0')
+ sofar++;
+ return sofar;
+}
+
+/** Convert a decimal string into a big uint.
+ *
+ * Params:
+ * data The biguint to be receive the result. Must be large enough to
+ * store the result.
+ * s The decimal string. May contain _ or 0 .. 9
+ *
+ * The required length for the destination buffer is slightly less than
+ * 1 + s.length/log2(10) = 1 + s.length/3.3219.
+ *
+ * Returns:
+ * the highest index of data which was used.
+ */
+int biguintFromDecimal(Range)(BigDigit[] data, Range s)
+if (
+ isInputRange!Range &&
+ isSomeChar!(ElementType!Range) &&
+ !isInfinite!Range)
+in
+{
+ static if (hasLength!Range)
+ assert((data.length >= 2) || (data.length == 1 && s.length == 1));
+}
+body
+{
+ import std.conv : ConvException;
+
+ // Convert to base 1e19 = 10_000_000_000_000_000_000.
+ // (this is the largest power of 10 that will fit into a long).
+ // The length will be less than 1 + s.length/log2(10) = 1 + s.length/3.3219.
+ // 485 bits will only just fit into 146 decimal digits.
+ // As we convert the string, we record the number of digits we've seen in base 19:
+ // hi is the number of digits/19, lo is the extra digits (0 to 18).
+ // TODO: This is inefficient for very large strings (it is O(n^^2)).
+ // We should take advantage of fast multiplication once the numbers exceed
+ // Karatsuba size.
+ uint lo = 0; // number of powers of digits, 0 .. 18
+ uint x = 0;
+ ulong y = 0;
+ uint hi = 0; // number of base 1e19 digits
+ data[0] = 0; // initially number is 0.
+ if (data.length > 1)
+ data[1] = 0;
+
+ foreach (character; s)
+ {
+ if (character == '_')
+ continue;
+
+ if (character < '0' || character > '9')
+ throw new ConvException("invalid digit");
+ x *= 10;
+ x += character - '0';
+ ++lo;
+ if (lo == 9)
+ {
+ y = x;
+ x = 0;
+ }
+ if (lo == 18)
+ {
+ y *= 10_0000_0000;
+ y += x;
+ x = 0;
+ }
+ if (lo == 19)
+ {
+ y *= 10;
+ y += x;
+ x = 0;
+ // Multiply existing number by 10^19, then add y1.
+ if (hi>0)
+ {
+ data[hi] = multibyteMul(data[0 .. hi], data[0 .. hi], 1_220_703_125*2u, 0); // 5^13*2 = 0x9184_E72A
+ ++hi;
+ data[hi] = multibyteMul(data[0 .. hi], data[0 .. hi], 15_625*262_144u, 0); // 5^6*2^18 = 0xF424_0000
+ ++hi;
+ }
+ else
+ hi = 2;
+ uint c = multibyteIncrementAssign!('+')(data[0 .. hi], cast(uint)(y&0xFFFF_FFFF));
+ c += multibyteIncrementAssign!('+')(data[1 .. hi], cast(uint)(y >> 32));
+ if (c != 0)
+ {
+ data[hi]=c;
+ ++hi;
+ }
+ y = 0;
+ lo = 0;
+ }
+ }
+ // Now set y = all remaining digits.
+ if (lo >= 18)
+ {
+ }
+ else if (lo >= 9)
+ {
+ for (int k=9; k<lo; ++k) y*=10;
+ y+=x;
+ }
+ else
+ {
+ for (int k=0; k<lo; ++k) y*=10;
+ y+=x;
+ }
+ if (lo != 0)
+ {
+ if (hi == 0)
+ {
+ data[0] = cast(uint) y;
+ if (data.length == 1)
+ {
+ hi = 1;
+ }
+ else
+ {
+ data[1] = cast(uint)(y >>> 32);
+ hi=2;
+ }
+ }
+ else
+ {
+ while (lo>0)
+ {
+ immutable c = multibyteMul(data[0 .. hi], data[0 .. hi], 10, 0);
+ if (c != 0)
+ {
+ data[hi]=c;
+ ++hi;
+ }
+ --lo;
+ }
+ uint c = multibyteIncrementAssign!('+')(data[0 .. hi], cast(uint)(y&0xFFFF_FFFF));
+ if (y > 0xFFFF_FFFFL)
+ {
+ c += multibyteIncrementAssign!('+')(data[1 .. hi], cast(uint)(y >> 32));
+ }
+ if (c != 0)
+ {
+ data[hi]=c;
+ ++hi;
+ }
+ }
+ }
+ while (hi>1 && data[hi-1]==0)
+ --hi;
+ return hi;
+}
+
+
+private:
+// ------------------------
+// These in-place functions are only for internal use; they are incompatible
+// with COW.
+
+// Classic 'schoolbook' multiplication.
+void mulSimple(BigDigit[] result, const(BigDigit) [] left,
+ const(BigDigit)[] right) pure nothrow
+in
+{
+ assert(result.length == left.length + right.length);
+ assert(right.length>1);
+}
+body
+{
+ result[left.length] = multibyteMul(result[0 .. left.length], left, right[0], 0);
+ multibyteMultiplyAccumulate(result[1..$], left, right[1..$]);
+}
+
+// Classic 'schoolbook' squaring
+void squareSimple(BigDigit[] result, const(BigDigit) [] x) pure nothrow
+in
+{
+ assert(result.length == 2*x.length);
+ assert(x.length>1);
+}
+body
+{
+ multibyteSquare(result, x);
+}
+
+
+// add two uints of possibly different lengths. Result must be as long
+// as the larger length.
+// Returns carry (0 or 1).
+uint addSimple(BigDigit[] result, const BigDigit [] left, const BigDigit [] right)
+pure nothrow
+in
+{
+ assert(result.length == left.length);
+ assert(left.length >= right.length);
+ assert(right.length>0);
+}
+body
+{
+ uint carry = multibyteAdd(result[0 .. right.length],
+ left[0 .. right.length], right, 0);
+ if (right.length < left.length)
+ {
+ result[right.length .. left.length] = left[right.length .. $];
+ carry = multibyteIncrementAssign!('+')(result[right.length..$], carry);
+ }
+ return carry;
+}
+
+// result = left - right
+// returns carry (0 or 1)
+BigDigit subSimple(BigDigit [] result,const(BigDigit) [] left,
+ const(BigDigit) [] right) pure nothrow
+in
+{
+ assert(result.length == left.length);
+ assert(left.length >= right.length);
+ assert(right.length>0);
+}
+body
+{
+ BigDigit carry = multibyteSub(result[0 .. right.length],
+ left[0 .. right.length], right, 0);
+ if (right.length < left.length)
+ {
+ result[right.length .. left.length] = left[right.length .. $];
+ carry = multibyteIncrementAssign!('-')(result[right.length..$], carry);
+ } //else if (result.length == left.length+1) { result[$-1] = carry; carry=0; }
+ return carry;
+}
+
+
+/* result = result - right
+ * Returns carry = 1 if result was less than right.
+*/
+BigDigit subAssignSimple(BigDigit [] result, const(BigDigit) [] right)
+pure nothrow
+{
+ assert(result.length >= right.length);
+ uint c = multibyteSub(result[0 .. right.length], result[0 .. right.length], right, 0);
+ if (c && result.length > right.length)
+ c = multibyteIncrementAssign!('-')(result[right.length .. $], c);
+ return c;
+}
+
+/* result = result + right
+*/
+BigDigit addAssignSimple(BigDigit [] result, const(BigDigit) [] right)
+pure nothrow
+{
+ assert(result.length >= right.length);
+ uint c = multibyteAdd(result[0 .. right.length], result[0 .. right.length], right, 0);
+ if (c && result.length > right.length)
+ c = multibyteIncrementAssign!('+')(result[right.length .. $], c);
+ return c;
+}
+
+/* performs result += wantSub? - right : right;
+*/
+BigDigit addOrSubAssignSimple(BigDigit [] result, const(BigDigit) [] right,
+ bool wantSub) pure nothrow
+{
+ if (wantSub)
+ return subAssignSimple(result, right);
+ else
+ return addAssignSimple(result, right);
+}
+
+
+// return true if x<y, considering leading zeros
+bool less(const(BigDigit)[] x, const(BigDigit)[] y) pure nothrow
+{
+ assert(x.length >= y.length);
+ auto k = x.length-1;
+ while (x[k]==0 && k >= y.length)
+ --k;
+ if (k >= y.length)
+ return false;
+ while (k>0 && x[k]==y[k])
+ --k;
+ return x[k] < y[k];
+}
+
+// Set result = abs(x-y), return true if result is negative(x<y), false if x <= y.
+bool inplaceSub(BigDigit[] result, const(BigDigit)[] x, const(BigDigit)[] y)
+ pure nothrow
+{
+ assert(result.length == (x.length >= y.length) ? x.length : y.length);
+
+ size_t minlen;
+ bool negative;
+ if (x.length >= y.length)
+ {
+ minlen = y.length;
+ negative = less(x, y);
+ }
+ else
+ {
+ minlen = x.length;
+ negative = !less(y, x);
+ }
+ const (BigDigit)[] large, small;
+ if (negative)
+ {
+ large = y; small = x;
+ }
+ else
+ {
+ large = x; small = y;
+ }
+
+ BigDigit carry = multibyteSub(result[0 .. minlen], large[0 .. minlen], small[0 .. minlen], 0);
+ if (x.length != y.length)
+ {
+ result[minlen .. large.length]= large[minlen..$];
+ result[large.length..$] = 0;
+ if (carry)
+ multibyteIncrementAssign!('-')(result[minlen..$], carry);
+ }
+ return negative;
+}
+
+/* Determine how much space is required for the temporaries
+ * when performing a Karatsuba multiplication.
+ */
+size_t karatsubaRequiredBuffSize(size_t xlen) pure nothrow @safe
+{
+ return xlen <= KARATSUBALIMIT ? 0 : 2*xlen; // - KARATSUBALIMIT+2;
+}
+
+/* Sets result = x*y, using Karatsuba multiplication.
+* x must be longer or equal to y.
+* Valid only for balanced multiplies, where x is not shorter than y.
+* It is superior to schoolbook multiplication if and only if
+* sqrt(2)*y.length > x.length > y.length.
+* Karatsuba multiplication is O(n^1.59), whereas schoolbook is O(n^2)
+* The maximum allowable length of x and y is uint.max; but better algorithms
+* should be used far before that length is reached.
+* Params:
+* scratchbuff An array long enough to store all the temporaries. Will be destroyed.
+*/
+void mulKaratsuba(BigDigit [] result, const(BigDigit) [] x,
+ const(BigDigit)[] y, BigDigit [] scratchbuff) pure nothrow
+{
+ assert(x.length >= y.length);
+ assert(result.length < uint.max, "Operands too large");
+ assert(result.length == x.length + y.length);
+ if (x.length <= KARATSUBALIMIT)
+ {
+ return mulSimple(result, x, y);
+ }
+ // Must be almost square (otherwise, a schoolbook iteration is better)
+ assert(2L * y.length * y.length > (x.length-1) * (x.length-1),
+ "Bigint Internal Error: Asymmetric Karatsuba");
+
+ // The subtractive version of Karatsuba multiply uses the following result:
+ // (Nx1 + x0)*(Ny1 + y0) = (N*N)*x1y1 + x0y0 + N * (x0y0 + x1y1 - mid)
+ // where mid = (x0-x1)*(y0-y1)
+ // requiring 3 multiplies of length N, instead of 4.
+ // The advantage of the subtractive over the additive version is that
+ // the mid multiply cannot exceed length N. But there are subtleties:
+ // (x0-x1),(y0-y1) may be negative or zero. To keep it simple, we
+ // retain all of the leading zeros in the subtractions
+
+ // half length, round up.
+ auto half = (x.length >> 1) + (x.length & 1);
+
+ const(BigDigit) [] x0 = x[0 .. half];
+ const(BigDigit) [] x1 = x[half .. $];
+ const(BigDigit) [] y0 = y[0 .. half];
+ const(BigDigit) [] y1 = y[half .. $];
+ BigDigit [] mid = scratchbuff[0 .. half*2];
+ BigDigit [] newscratchbuff = scratchbuff[half*2 .. $];
+ BigDigit [] resultLow = result[0 .. 2*half];
+ BigDigit [] resultHigh = result[2*half .. $];
+ // initially use result to store temporaries
+ BigDigit [] xdiff= result[0 .. half];
+ BigDigit [] ydiff = result[half .. half*2];
+
+ // First, we calculate mid, and sign of mid
+ immutable bool midNegative = inplaceSub(xdiff, x0, x1)
+ ^ inplaceSub(ydiff, y0, y1);
+ mulKaratsuba(mid, xdiff, ydiff, newscratchbuff);
+
+ // Low half of result gets x0 * y0. High half gets x1 * y1
+
+ mulKaratsuba(resultLow, x0, y0, newscratchbuff);
+
+ if (2L * y1.length * y1.length < x1.length * x1.length)
+ {
+ // an asymmetric situation has been created.
+ // Worst case is if x:y = 1.414 : 1, then x1:y1 = 2.41 : 1.
+ // Applying one schoolbook multiply gives us two pieces each 1.2:1
+ if (y1.length <= KARATSUBALIMIT)
+ mulSimple(resultHigh, x1, y1);
+ else
+ {
+ // divide x1 in two, then use schoolbook multiply on the two pieces.
+ auto quarter = (x1.length >> 1) + (x1.length & 1);
+ immutable ysmaller = (quarter >= y1.length);
+ mulKaratsuba(resultHigh[0 .. quarter+y1.length], ysmaller ? x1[0 .. quarter] : y1,
+ ysmaller ? y1 : x1[0 .. quarter], newscratchbuff);
+ // Save the part which will be overwritten.
+ immutable ysmaller2 = ((x1.length - quarter) >= y1.length);
+ newscratchbuff[0 .. y1.length] = resultHigh[quarter .. quarter + y1.length];
+ mulKaratsuba(resultHigh[quarter..$], ysmaller2 ? x1[quarter..$] : y1,
+ ysmaller2 ? y1 : x1[quarter..$], newscratchbuff[y1.length..$]);
+
+ resultHigh[quarter..$].addAssignSimple(newscratchbuff[0 .. y1.length]);
+ }
+ }
+ else
+ mulKaratsuba(resultHigh, x1, y1, newscratchbuff);
+
+ /* We now have result = x0y0 + (N*N)*x1y1
+ Before adding or subtracting mid, we must calculate
+ result += N * (x0y0 + x1y1)
+ We can do this with three half-length additions. With a = x0y0, b = x1y1:
+ aHI aLO
+ + aHI aLO
+ + bHI bLO
+ + bHI bLO
+ = R3 R2 R1 R0
+ R1 = aHI + bLO + aLO
+ R2 = aHI + bLO + aHI + carry_from_R1
+ R3 = bHi + carry_from_R2
+
+ It might actually be quicker to do it in two full-length additions:
+ newscratchbuff[2*half] = addSimple(newscratchbuff[0 .. 2*half], result[0 .. 2*half], result[2*half..$]);
+ addAssignSimple(result[half..$], newscratchbuff[0 .. 2*half+1]);
+ */
+ BigDigit[] R1 = result[half .. half*2];
+ BigDigit[] R2 = result[half*2 .. half*3];
+ BigDigit[] R3 = result[half*3..$];
+ BigDigit c1 = multibyteAdd(R2, R2, R1, 0); // c1:R2 = R2 + R1
+ BigDigit c2 = multibyteAdd(R1, R2, result[0 .. half], 0); // c2:R1 = R2 + R1 + R0
+ BigDigit c3 = addAssignSimple(R2, R3); // R2 = R2 + R1 + R3
+ if (c1+c2)
+ multibyteIncrementAssign!('+')(result[half*2..$], c1+c2);
+ if (c1+c3)
+ multibyteIncrementAssign!('+')(R3, c1+c3);
+
+ // And finally we subtract mid
+ addOrSubAssignSimple(result[half..$], mid, !midNegative);
+}
+
+void squareKaratsuba(BigDigit [] result, const BigDigit [] x,
+ BigDigit [] scratchbuff) pure nothrow
+{
+ // See mulKaratsuba for implementation comments.
+ // Squaring is simpler, since it never gets asymmetric.
+ assert(result.length < uint.max, "Operands too large");
+ assert(result.length == 2*x.length);
+ if (x.length <= KARATSUBASQUARELIMIT)
+ {
+ return squareSimple(result, x);
+ }
+ // half length, round up.
+ auto half = (x.length >> 1) + (x.length & 1);
+
+ const(BigDigit)[] x0 = x[0 .. half];
+ const(BigDigit)[] x1 = x[half .. $];
+ BigDigit [] mid = scratchbuff[0 .. half*2];
+ BigDigit [] newscratchbuff = scratchbuff[half*2 .. $];
+ // initially use result to store temporaries
+ BigDigit [] xdiff= result[0 .. half];
+ const BigDigit [] ydiff = result[half .. half*2];
+
+ // First, we calculate mid. We don't need its sign
+ inplaceSub(xdiff, x0, x1);
+ squareKaratsuba(mid, xdiff, newscratchbuff);
+
+ // Set result = x0x0 + (N*N)*x1x1
+ squareKaratsuba(result[0 .. 2*half], x0, newscratchbuff);
+ squareKaratsuba(result[2*half .. $], x1, newscratchbuff);
+
+ /* result += N * (x0x0 + x1x1)
+ Do this with three half-length additions. With a = x0x0, b = x1x1:
+ R1 = aHI + bLO + aLO
+ R2 = aHI + bLO + aHI + carry_from_R1
+ R3 = bHi + carry_from_R2
+ */
+ BigDigit[] R1 = result[half .. half*2];
+ BigDigit[] R2 = result[half*2 .. half*3];
+ BigDigit[] R3 = result[half*3..$];
+ BigDigit c1 = multibyteAdd(R2, R2, R1, 0); // c1:R2 = R2 + R1
+ BigDigit c2 = multibyteAdd(R1, R2, result[0 .. half], 0); // c2:R1 = R2 + R1 + R0
+ BigDigit c3 = addAssignSimple(R2, R3); // R2 = R2 + R1 + R3
+ if (c1+c2) multibyteIncrementAssign!('+')(result[half*2..$], c1+c2);
+ if (c1+c3) multibyteIncrementAssign!('+')(R3, c1+c3);
+
+ // And finally we subtract mid, which is always positive
+ subAssignSimple(result[half..$], mid);
+}
+
+/* Knuth's Algorithm D, as presented in
+ * H.S. Warren, "Hacker's Delight", Addison-Wesley Professional (2002).
+ * Also described in "Modern Computer Arithmetic" 0.2, Exercise 1.8.18.
+ * Given u and v, calculates quotient = u / v, u = u % v.
+ * v must be normalized (ie, the MSB of v must be 1).
+ * The most significant words of quotient and u may be zero.
+ * u[0 .. v.length] holds the remainder.
+ */
+void schoolbookDivMod(BigDigit [] quotient, BigDigit [] u, in BigDigit [] v)
+ pure nothrow
+{
+ assert(quotient.length == u.length - v.length);
+ assert(v.length > 1);
+ assert(u.length >= v.length);
+ assert((v[$-1]&0x8000_0000)!=0);
+ assert(u[$-1] < v[$-1]);
+ // BUG: This code only works if BigDigit is uint.
+ uint vhi = v[$-1];
+ uint vlo = v[$-2];
+
+ for (ptrdiff_t j = u.length - v.length - 1; j >= 0; j--)
+ {
+ // Compute estimate of quotient[j],
+ // qhat = (three most significant words of u)/(two most sig words of v).
+ uint qhat;
+ if (u[j + v.length] == vhi)
+ {
+ // uu/vhi could exceed uint.max (it will be 0x8000_0000 or 0x8000_0001)
+ qhat = uint.max;
+ }
+ else
+ {
+ uint ulo = u[j + v.length - 2];
+ version (D_InlineAsm_X86)
+ {
+ // Note: On DMD, this is only ~10% faster than the non-asm code.
+ uint *p = &u[j + v.length - 1];
+ asm pure nothrow
+ {
+ mov EAX, p;
+ mov EDX, [EAX+4];
+ mov EAX, [EAX];
+ div dword ptr [vhi];
+ mov qhat, EAX;
+ mov ECX, EDX;
+div3by2correction:
+ mul dword ptr [vlo]; // EDX:EAX = qhat * vlo
+ sub EAX, ulo;
+ sbb EDX, ECX;
+ jbe div3by2done;
+ mov EAX, qhat;
+ dec EAX;
+ mov qhat, EAX;
+ add ECX, dword ptr [vhi];
+ jnc div3by2correction;
+div3by2done: ;
+ }
+ }
+ else
+ { // version (InlineAsm)
+ ulong uu = (cast(ulong)(u[j + v.length]) << 32) | u[j + v.length - 1];
+ immutable bigqhat = uu / vhi;
+ ulong rhat = uu - bigqhat * vhi;
+ qhat = cast(uint) bigqhat;
+again:
+ if (cast(ulong) qhat * vlo > ((rhat << 32) + ulo))
+ {
+ --qhat;
+ rhat += vhi;
+ if (!(rhat & 0xFFFF_FFFF_0000_0000L))
+ goto again;
+ }
+ } // version (InlineAsm)
+ }
+ // Multiply and subtract.
+ uint carry = multibyteMulAdd!('-')(u[j .. j + v.length], v, qhat, 0);
+
+ if (u[j+v.length] < carry)
+ {
+ // If we subtracted too much, add back
+ --qhat;
+ carry -= multibyteAdd(u[j .. j + v.length],u[j .. j + v.length], v, 0);
+ }
+ quotient[j] = qhat;
+ u[j + v.length] = u[j + v.length] - carry;
+ }
+}
+
+private:
+
+// TODO: Replace with a library call
+void itoaZeroPadded(char[] output, uint value)
+ pure nothrow @safe @nogc
+{
+ for (auto i = output.length; i--;)
+ {
+ if (value < 10)
+ {
+ output[i] = cast(char)(value + '0');
+ value = 0;
+ }
+ else
+ {
+ output[i] = cast(char)(value % 10 + '0');
+ value /= 10;
+ }
+ }
+}
+
+void toHexZeroPadded(char[] output, uint value,
+ LetterCase letterCase = LetterCase.upper) pure nothrow @safe
+{
+ ptrdiff_t x = output.length - 1;
+ static immutable string upperHexDigits = "0123456789ABCDEF";
+ static immutable string lowerHexDigits = "0123456789abcdef";
+ for ( ; x >= 0; --x)
+ {
+ if (letterCase == LetterCase.upper)
+ {
+ output[x] = upperHexDigits[value & 0xF];
+ }
+ else
+ {
+ output[x] = lowerHexDigits[value & 0xF];
+ }
+ value >>= 4;
+ }
+}
+
+private:
+
+// Returns the highest value of i for which left[i]!=right[i],
+// or 0 if left[] == right[]
+size_t highestDifferentDigit(const BigDigit [] left, const BigDigit [] right)
+pure nothrow @nogc @safe
+{
+ assert(left.length == right.length);
+ for (ptrdiff_t i = left.length - 1; i>0; --i)
+ {
+ if (left[i] != right[i])
+ return i;
+ }
+ return 0;
+}
+
+// Returns the lowest value of i for which x[i]!=0.
+int firstNonZeroDigit(const BigDigit [] x) pure nothrow @nogc @safe
+{
+ int k = 0;
+ while (x[k]==0)
+ {
+ ++k;
+ assert(k<x.length);
+ }
+ return k;
+}
+
+/*
+ Calculate quotient and remainder of u / v using fast recursive division.
+ v must be normalised, and must be at least half as long as u.
+ Given u and v, v normalised, calculates quotient = u/v, u = u%v.
+ scratch is temporary storage space, length must be >= quotient + 1.
+
+Returns:
+ u[0 .. v.length] is the remainder. u[v.length..$] is corrupted.
+
+ Implements algorithm 1.8 from MCA.
+ This algorithm has an annoying special case. After the first recursion, the
+ highest bit of the quotient may be set. This means that in the second
+ recursive call, the 'in' contract would be violated. (This happens only
+ when the top quarter of u is equal to the top half of v. A base 10
+ equivalent example of this situation is 5517/56; the first step gives
+ 55/5 = 11). To maintain the in contract, we pad a zero to the top of both
+ u and the quotient. 'mayOverflow' indicates that that the special case
+ has occurred.
+ (In MCA, a different strategy is used: the in contract is weakened, and
+ schoolbookDivMod is more general: it allows the high bit of u to be set).
+ See also:
+ - C. Burkinel and J. Ziegler, "Fast Recursive Division", MPI-I-98-1-022,
+ Max-Planck Institute fuer Informatik, (Oct 1998).
+*/
+void recursiveDivMod(BigDigit[] quotient, BigDigit[] u, const(BigDigit)[] v,
+ BigDigit[] scratch, bool mayOverflow = false)
+ pure nothrow
+in
+{
+ // v must be normalized
+ assert(v.length > 1);
+ assert((v[$ - 1] & 0x8000_0000) != 0);
+ assert(!(u[$ - 1] & 0x8000_0000));
+ assert(quotient.length == u.length - v.length);
+ if (mayOverflow)
+ {
+ assert(u[$-1] == 0);
+ assert(u[$-2] & 0x8000_0000);
+ }
+
+ // Must be symmetric. Use block schoolbook division if not.
+ assert((mayOverflow ? u.length-1 : u.length) <= 2 * v.length);
+ assert((mayOverflow ? u.length-1 : u.length) >= v.length);
+ assert(scratch.length >= quotient.length + (mayOverflow ? 0 : 1));
+}
+body
+{
+ if (quotient.length < FASTDIVLIMIT)
+ {
+ return schoolbookDivMod(quotient, u, v);
+ }
+
+ // Split quotient into two halves, but keep padding in the top half
+ auto k = (mayOverflow ? quotient.length - 1 : quotient.length) >> 1;
+
+ // RECURSION 1: Calculate the high half of the quotient
+
+ // Note that if u and quotient were padded, they remain padded during
+ // this call, so in contract is satisfied.
+ recursiveDivMod(quotient[k .. $], u[2 * k .. $], v[k .. $],
+ scratch, mayOverflow);
+
+ // quotient[k..$] is our guess at the high quotient.
+ // u[2*k .. 2.*k + v.length - k = k + v.length] is the high part of the
+ // first remainder. u[0 .. 2*k] is the low part.
+
+ // Calculate the full first remainder to be
+ // remainder - highQuotient * lowDivisor
+ // reducing highQuotient until the remainder is positive.
+ // The low part of the remainder, u[0 .. k], cannot be altered by this.
+
+ adjustRemainder(quotient[k .. $], u[k .. k + v.length], v, k,
+ scratch[0 .. quotient.length], mayOverflow);
+
+ // RECURSION 2: Calculate the low half of the quotient
+ // The full first remainder is now in u[0 .. k + v.length].
+
+ if (u[k + v.length - 1] & 0x8000_0000)
+ {
+ // Special case. The high quotient is 0x1_00...000 or 0x1_00...001.
+ // This means we need an extra quotient word for the next recursion.
+ // We need to restore the invariant for the recursive calls.
+ // We do this by padding both u and quotient. Extending u is trivial,
+ // because the higher words will not be used again. But for the
+ // quotient, we're clobbering the low word of the high quotient,
+ // so we need save it, and add it back in after the recursive call.
+
+ auto clobberedQuotient = quotient[k];
+ u[k+v.length] = 0;
+
+ recursiveDivMod(quotient[0 .. k+1], u[k .. k + v.length+1],
+ v[k .. $], scratch, true);
+ adjustRemainder(quotient[0 .. k+1], u[0 .. v.length], v, k,
+ scratch[0 .. 2 * k+1], true);
+
+ // Now add the quotient word that got clobbered earlier.
+ multibyteIncrementAssign!('+')(quotient[k..$], clobberedQuotient);
+ }
+ else
+ {
+ // The special case has NOT happened.
+ recursiveDivMod(quotient[0 .. k], u[k .. k + v.length], v[k .. $],
+ scratch, false);
+
+ // high remainder is in u[k .. k+(v.length-k)] == u[k .. v.length]
+
+ adjustRemainder(quotient[0 .. k], u[0 .. v.length], v, k,
+ scratch[0 .. 2 * k]);
+ }
+}
+
+// rem -= quot * v[0 .. k].
+// If would make rem negative, decrease quot until rem is >= 0.
+// Needs (quot.length * k) scratch space to store the result of the multiply.
+void adjustRemainder(BigDigit[] quot, BigDigit[] rem, const(BigDigit)[] v,
+ ptrdiff_t k,
+ BigDigit[] scratch, bool mayOverflow = false) pure nothrow
+{
+ assert(rem.length == v.length);
+ mulInternal(scratch, quot, v[0 .. k]);
+ uint carry = 0;
+ if (mayOverflow)
+ carry = scratch[$-1] + subAssignSimple(rem, scratch[0..$-1]);
+ else
+ carry = subAssignSimple(rem, scratch);
+ while (carry)
+ {
+ multibyteIncrementAssign!('-')(quot, 1); // quot--
+ carry -= multibyteAdd(rem, rem, v, 0);
+ }
+}
+
+// Cope with unbalanced division by performing block schoolbook division.
+void blockDivMod(BigDigit [] quotient, BigDigit [] u, in BigDigit [] v)
+pure nothrow
+{
+ import core.memory : GC;
+ assert(quotient.length == u.length - v.length);
+ assert(v.length > 1);
+ assert(u.length >= v.length);
+ assert((v[$-1] & 0x8000_0000)!=0);
+ assert((u[$-1] & 0x8000_0000)==0);
+ BigDigit [] scratch = new BigDigit[v.length + 1];
+
+ // Perform block schoolbook division, with 'v.length' blocks.
+ auto m = u.length - v.length;
+ while (m > v.length)
+ {
+ immutable mayOverflow = (u[m + v.length -1 ] & 0x8000_0000)!=0;
+ BigDigit saveq;
+ if (mayOverflow)
+ {
+ u[m + v.length] = 0;
+ saveq = quotient[m];
+ }
+ recursiveDivMod(quotient[m-v.length .. m + (mayOverflow? 1: 0)],
+ u[m - v.length .. m + v.length + (mayOverflow? 1: 0)], v, scratch, mayOverflow);
+ if (mayOverflow)
+ {
+ assert(quotient[m] == 0);
+ quotient[m] = saveq;
+ }
+ m -= v.length;
+ }
+ recursiveDivMod(quotient[0 .. m], u[0 .. m + v.length], v, scratch);
+ () @trusted { GC.free(scratch.ptr); } ();
+}
+
+@system unittest
+{
+ import core.stdc.stdio;
+
+ void printBiguint(const uint [] data)
+ {
+ char [] buff = biguintToHex(new char[data.length*9], data, '_');
+ printf("%.*s\n", buff.length, buff.ptr);
+ }
+
+ void printDecimalBigUint(BigUint data)
+ {
+ auto str = data.toDecimalString(0);
+ printf("%.*s\n", str.length, str.ptr);
+ }
+
+ uint [] a, b;
+ a = new uint[43];
+ b = new uint[179];
+ for (int i=0; i<a.length; ++i) a[i] = 0x1234_B6E9 + i;
+ for (int i=0; i<b.length; ++i) b[i] = 0x1BCD_8763 - i*546;
+
+ a[$-1] |= 0x8000_0000;
+ uint [] r = new uint[a.length];
+ uint [] q = new uint[b.length-a.length+1];
+
+ divModInternal(q, r, b, a);
+ q = q[0..$-1];
+ uint [] r1 = r.dup;
+ uint [] q1 = q.dup;
+ blockDivMod(q, b, a);
+ r = b[0 .. a.length];
+ assert(r[] == r1[]);
+ assert(q[] == q1[]);
+}
+
+// biguintToOctal
+@safe unittest
+{
+ enum bufSize = 5 * BigDigitBits / 3 + 1;
+ auto buf = new char[bufSize];
+ size_t i;
+ BigDigit[] data = [ 342391 ];
+
+ // Basic functionality with single word
+ i = biguintToOctal(buf, data);
+ assert(i == bufSize - 7 && buf[i .. $] == "1234567");
+
+ // Test carrying bits between words
+ data = [ 0x77053977, 0x39770539, 0x00000005 ];
+ i = biguintToOctal(buf, data);
+ assert(i == bufSize - 23 && buf[i .. $] == "12345670123456701234567");
+
+ // Test carried bits in the last word
+ data = [ 0x80000000 ];
+ i = biguintToOctal(buf, data);
+ assert(buf[i .. $] == "20000000000");
+
+ // Test boundary between 3rd and 4th word where the number of bits is
+ // divisible by 3 and no bits should be carried.
+ //
+ // The 0xC0000000's are for "poisoning" the carry to be non-zero when the
+ // rollover happens, so that if any bugs happen in wrongly adding the carry
+ // to the next word, non-zero bits will show up in the output.
+ data = [ 0xC0000000, 0xC0000000, 0xC0000000, 0x00000010 ];
+ i = biguintToOctal(buf, data);
+ assert(buf[i .. $] == "2060000000001400000000030000000000");
+
+ // Boundary case: 0
+ data = [ 0 ];
+ i = biguintToOctal(buf, data);
+ assert(buf[i .. $] == "0");
+}
diff --git a/libphobos/src/std/internal/math/biguintnoasm.d b/libphobos/src/std/internal/math/biguintnoasm.d
new file mode 100644
index 0000000..aea1d50
--- /dev/null
+++ b/libphobos/src/std/internal/math/biguintnoasm.d
@@ -0,0 +1,370 @@
+/** Arbitrary precision arithmetic ('bignum') for processors with no asm support
+ *
+ * All functions operate on arrays of uints, stored LSB first.
+ * If there is a destination array, it will be the first parameter.
+ * Currently, all of these functions are subject to change, and are
+ * intended for internal use only.
+ * This module is intended only to assist development of high-speed routines
+ * on currently unsupported processors.
+ * The X86 asm version is about 30 times faster than the D version (DMD).
+ */
+
+/* Copyright Don Clugston 2008 - 2010.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+
+module std.internal.math.biguintnoasm;
+
+nothrow:
+@safe:
+
+public:
+alias BigDigit = uint; // A Bignum is an array of BigDigits.
+
+ // Limits for when to switch between multiplication algorithms.
+enum int KARATSUBALIMIT = 10; // Minimum value for which Karatsuba is worthwhile.
+enum int KARATSUBASQUARELIMIT = 12; // Minimum value for which square Karatsuba is worthwhile
+
+
+/** Multi-byte addition or subtraction
+ * dest[] = src1[] + src2[] + carry (0 or 1).
+ * or dest[] = src1[] - src2[] - carry (0 or 1).
+ * Returns carry or borrow (0 or 1).
+ * Set op == '+' for addition, '-' for subtraction.
+ */
+uint multibyteAddSub(char op)(uint[] dest, const(uint) [] src1,
+ const (uint) [] src2, uint carry) pure @nogc @safe
+{
+ ulong c = carry;
+ for (size_t i = 0; i < src2.length; ++i)
+ {
+ static if (op=='+') c = c + src1[i] + src2[i];
+ else c = cast(ulong) src1[i] - src2[i] - c;
+ dest[i] = cast(uint) c;
+ c = (c > 0xFFFF_FFFF);
+ }
+ return cast(uint) c;
+}
+
+@safe unittest
+{
+ uint [] a = new uint[40];
+ uint [] b = new uint[40];
+ uint [] c = new uint[40];
+ for (size_t i = 0; i < a.length; ++i)
+ {
+ if (i&1) a[i]=cast(uint)(0x8000_0000 + i);
+ else a[i]=cast(uint) i;
+ b[i]= 0x8000_0003;
+ }
+ c[19]=0x3333_3333;
+ uint carry = multibyteAddSub!('+')(c[0 .. 18], b[0 .. 18], a[0 .. 18], 0);
+ assert(c[0]==0x8000_0003);
+ assert(c[1]==4);
+ assert(c[19]==0x3333_3333); // check for overrun
+ assert(carry == 1);
+ for (size_t i = 0; i < a.length; ++i)
+ {
+ a[i] = b[i] = c[i] = 0;
+ }
+ a[8]=0x048D159E;
+ b[8]=0x048D159E;
+ a[10]=0x1D950C84;
+ b[10]=0x1D950C84;
+ a[5] =0x44444444;
+ carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
+ assert(a[11] == 0);
+ for (size_t i = 0; i < 10; ++i)
+ if (i != 5)
+ assert(a[i] == 0);
+
+ for (size_t q = 3; q < 36; ++q)
+ {
+ for (size_t i = 0; i< a.length; ++i)
+ {
+ a[i] = b[i] = c[i] = 0;
+ }
+ a[q-2]=0x040000;
+ b[q-2]=0x040000;
+ carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
+ assert(a[q-2]==0);
+ }
+}
+
+
+
+/** dest[] += carry, or dest[] -= carry.
+ * op must be '+' or '-'
+ * Returns final carry or borrow (0 or 1)
+ */
+uint multibyteIncrementAssign(char op)(uint[] dest, uint carry)
+ pure @nogc @safe
+{
+ static if (op=='+')
+ {
+ ulong c = carry;
+ c += dest[0];
+ dest[0] = cast(uint) c;
+ if (c <= 0xFFFF_FFFF)
+ return 0;
+
+ for (size_t i = 1; i < dest.length; ++i)
+ {
+ ++dest[i];
+ if (dest[i] != 0)
+ return 0;
+ }
+ return 1;
+ }
+ else
+ {
+ ulong c = carry;
+ c = dest[0] - c;
+ dest[0] = cast(uint) c;
+ if (c <= 0xFFFF_FFFF)
+ return 0;
+ for (size_t i = 1; i < dest.length; ++i)
+ {
+ --dest[i];
+ if (dest[i] != 0xFFFF_FFFF)
+ return 0;
+ }
+ return 1;
+ }
+}
+
+/** dest[] = src[] << numbits
+ * numbits must be in the range 1 .. 31
+ */
+uint multibyteShl(uint [] dest, const(uint) [] src, uint numbits)
+ pure @nogc @safe
+{
+ ulong c = 0;
+ for (size_t i = 0; i < dest.length; ++i)
+ {
+ c += (cast(ulong)(src[i]) << numbits);
+ dest[i] = cast(uint) c;
+ c >>>= 32;
+ }
+ return cast(uint) c;
+}
+
+
+/** dest[] = src[] >> numbits
+ * numbits must be in the range 1 .. 31
+ */
+void multibyteShr(uint [] dest, const(uint) [] src, uint numbits)
+ pure @nogc @safe
+{
+ ulong c = 0;
+ for (ptrdiff_t i = dest.length; i != 0; --i)
+ {
+ c += (src[i-1] >>numbits) + (cast(ulong)(src[i-1]) << (64 - numbits));
+ dest[i-1] = cast(uint) c;
+ c >>>= 32;
+ }
+}
+
+@safe unittest
+{
+
+ uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteShr(aa[0..$-2], aa, 4);
+ assert(aa[0] == 0x6122_2222 && aa[1] == 0xA455_5555 && aa[2] == 0x0899_9999);
+ assert(aa[3] == 0xBCCC_CCCD);
+
+ aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteShr(aa[0..$-1], aa, 4);
+ assert(aa[0] == 0x6122_2222 && aa[1] == 0xA455_5555
+ && aa[2] == 0xD899_9999 && aa[3] == 0x0BCC_CCCC);
+
+ aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD,
+ 0xEEEE_EEEE];
+ multibyteShl(aa[1 .. 4], aa[1..$], 4);
+ assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230
+ && aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
+}
+
+/** dest[] = src[] * multiplier + carry.
+ * Returns carry.
+ */
+uint multibyteMul(uint[] dest, const(uint)[] src, uint multiplier, uint carry)
+ pure @nogc @safe
+{
+ assert(dest.length == src.length);
+ ulong c = carry;
+ for (size_t i = 0; i < src.length; ++i)
+ {
+ c += cast(ulong)(src[i]) * multiplier;
+ dest[i] = cast(uint) c;
+ c>>=32;
+ }
+ return cast(uint) c;
+}
+
+@safe unittest
+{
+ uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A,
+ 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
+ assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && aa[2]==0x5555_5561
+ && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
+}
+
+/**
+ * dest[] += src[] * multiplier + carry(0 .. FFFF_FFFF).
+ * Returns carry out of MSB (0 .. FFFF_FFFF).
+ */
+uint multibyteMulAdd(char op)(uint [] dest, const(uint)[] src,
+ uint multiplier, uint carry) pure @nogc @safe
+{
+ assert(dest.length == src.length);
+ ulong c = carry;
+ for (size_t i = 0; i < src.length; ++i)
+ {
+ static if (op=='+')
+ {
+ c += cast(ulong)(multiplier) * src[i] + dest[i];
+ dest[i] = cast(uint) c;
+ c >>= 32;
+ }
+ else
+ {
+ c += cast(ulong) multiplier * src[i];
+ ulong t = cast(ulong) dest[i] - cast(uint) c;
+ dest[i] = cast(uint) t;
+ c = cast(uint)((c >> 32) - (t >> 32));
+ }
+ }
+ return cast(uint) c;
+}
+
+@safe unittest
+{
+
+ uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A,
+ 0xBCCC_CCCD, 0xEEEE_EEEE];
+ uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0,
+ 0xC0C0_C0C0];
+ multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
+ assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
+ assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0 + 5
+ && bb[2] == 0x5555_5561 + 0x00C0_C0C0 + 1
+ && bb[3] == 0x9999_99A4 + 0xF0F0_F0F0 );
+}
+
+
+/**
+ Sets result = result[0 .. left.length] + left * right
+
+ It is defined in this way to allow cache-efficient multiplication.
+ This function is equivalent to:
+ ----
+ for (size_t i = 0; i< right.length; ++i)
+ {
+ dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
+ left, right[i], 0);
+ }
+ ----
+ */
+void multibyteMultiplyAccumulate(uint [] dest, const(uint)[] left, const(uint)
+ [] right) pure @nogc @safe
+{
+ for (size_t i = 0; i < right.length; ++i)
+ {
+ dest[left.length + i] = multibyteMulAdd!('+')(dest[i .. left.length+i],
+ left, right[i], 0);
+ }
+}
+
+/** dest[] /= divisor.
+ * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
+ */
+uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow)
+ pure @nogc @safe
+{
+ ulong c = cast(ulong) overflow;
+ for (ptrdiff_t i = dest.length-1; i >= 0; --i)
+ {
+ c = (c << 32) + cast(ulong)(dest[i]);
+ uint q = cast(uint)(c/divisor);
+ c -= divisor * q;
+ dest[i] = q;
+ }
+ return cast(uint) c;
+}
+
+@safe unittest
+{
+ uint [] aa = new uint[101];
+ for (uint i = 0; i < aa.length; ++i)
+ aa[i] = 0x8765_4321 * (i+3);
+ uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
+ uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
+ for (uint i=0; i<aa.length; ++i)
+ {
+ assert(aa[i] == 0x8765_4321 * (i+3));
+ }
+ assert(r == 0x33FF_7461);
+
+}
+// Set dest[2*i .. 2*i+1]+=src[i]*src[i]
+void multibyteAddDiagonalSquares(uint[] dest, const(uint)[] src)
+ pure @nogc @safe
+{
+ ulong c = 0;
+ for (size_t i = 0; i < src.length; ++i)
+ {
+ // At this point, c is 0 or 1, since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
+ c += cast(ulong)(src[i]) * src[i] + dest[2*i];
+ dest[2*i] = cast(uint) c;
+ c = (c>>=32) + dest[2*i+1];
+ dest[2*i+1] = cast(uint) c;
+ c >>= 32;
+ }
+}
+
+// Does half a square multiply. (square = diagonal + 2*triangle)
+void multibyteTriangleAccumulate(uint[] dest, const(uint)[] x)
+ pure @nogc @safe
+{
+ // x[0]*x[1...$] + x[1]*x[2..$] + ... + x[$-2]x[$-1..$]
+ dest[x.length] = multibyteMul(dest[1 .. x.length], x[1..$], x[0], 0);
+ if (x.length < 4)
+ {
+ if (x.length == 3)
+ {
+ ulong c = cast(ulong)(x[$-1]) * x[$-2] + dest[2*x.length-3];
+ dest[2*x.length - 3] = cast(uint) c;
+ c >>= 32;
+ dest[2*x.length - 2] = cast(uint) c;
+ }
+ return;
+ }
+ for (size_t i = 2; i < x.length - 2; ++i)
+ {
+ dest[i-1+ x.length] = multibyteMulAdd!('+')(
+ dest[i+i-1 .. i+x.length-1], x[i..$], x[i-1], 0);
+ }
+ // Unroll the last two entries, to reduce loop overhead:
+ ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[2*x.length-5];
+ dest[2*x.length-5] = cast(uint) c;
+ c >>= 32;
+ c += cast(ulong)(x[$-3]) * x[$-1] + dest[2*x.length-4];
+ dest[2*x.length-4] = cast(uint) c;
+ c >>= 32;
+ c += cast(ulong)(x[$-1]) * x[$-2];
+ dest[2*x.length-3] = cast(uint) c;
+ c >>= 32;
+ dest[2*x.length-2] = cast(uint) c;
+}
+
+void multibyteSquare(BigDigit[] result, const(BigDigit) [] x) pure @nogc @safe
+{
+ multibyteTriangleAccumulate(result, x);
+ result[$-1] = multibyteShl(result[1..$-1], result[1..$-1], 1); // mul by 2
+ result[0] = 0;
+ multibyteAddDiagonalSquares(result, x);
+}
diff --git a/libphobos/src/std/internal/math/biguintx86.d b/libphobos/src/std/internal/math/biguintx86.d
new file mode 100644
index 0000000..bd03d2e
--- /dev/null
+++ b/libphobos/src/std/internal/math/biguintx86.d
@@ -0,0 +1,1353 @@
+/** Optimised asm arbitrary precision arithmetic ('bignum')
+ * routines for X86 processors.
+ *
+ * All functions operate on arrays of uints, stored LSB first.
+ * If there is a destination array, it will be the first parameter.
+ * Currently, all of these functions are subject to change, and are
+ * intended for internal use only.
+ * The symbol [#] indicates an array of machine words which is to be
+ * interpreted as a multi-byte number.
+ */
+
+/* Copyright Don Clugston 2008 - 2010.
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ */
+/**
+ * In simple terms, there are 3 modern x86 microarchitectures:
+ * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
+ * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
+ * (c) the Pentium 4, produced by Marketing.
+ *
+ * This code has been optimised for the Intel P6 family.
+ * Generally the code remains near-optimal for Intel Core2/Corei7, after
+ * translating EAX-> RAX, etc, since all these CPUs use essentially the same
+ * pipeline, and are typically limited by memory access.
+ * The code uses techniques described in Agner Fog's superb Pentium manuals
+ * available at www.agner.org.
+ * Not optimised for AMD, which can do two memory loads per cycle (Intel
+ * CPUs can only do one). Despite this, performance is superior on AMD.
+ * Performance is dreadful on P4.
+ *
+ * Timing results (cycles per int)
+ * --Intel Pentium-- --AMD--
+ * PM P4 Core2 K7
+ * +,- 2.25 15.6 2.25 1.5
+ * <<,>> 2.0 6.6 2.0 5.0
+ * (<< MMX) 1.7 5.3 1.5 1.2
+ * * 5.0 15.0 4.0 4.3
+ * mulAdd 5.7 19.0 4.9 4.0
+ * div 30.0 32.0 32.0 22.4
+ * mulAcc(32) 6.5 20.0 5.4 4.9
+ *
+ * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
+ * function call overhead.
+ * The timing for Div is quite unpredictable, but it's probably too slow
+ * to be useful. On 64-bit processors, these times should
+ * halve if run in 64-bit mode, except for the MMX functions.
+ */
+
+module std.internal.math.biguintx86;
+
+@system:
+pure:
+nothrow:
+
+/*
+ Naked asm is used throughout, because:
+ (a) it frees up the EBP register
+ (b) compiler bugs prevent the use of .ptr when a frame pointer is used.
+*/
+
+version (D_InlineAsm_X86)
+{
+
+private:
+
+/* Duplicate string s, with n times, substituting index for '@'.
+ *
+ * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
+ * function for some of the asm routines.
+ */
+string indexedLoopUnroll(int n, string s) pure @safe
+{
+ string u;
+ for (int i = 0; i<n; ++i)
+ {
+ string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
+
+ int last = 0;
+ for (int j = 0; j<s.length; ++j)
+ {
+ if (s[j]=='@')
+ {
+ u ~= s[last .. j] ~ nstr;
+ last = j+1;
+ }
+ }
+ if (last<s.length) u = u ~ s[last..$];
+
+ }
+ return u;
+}
+@safe unittest
+{
+ assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
+}
+
+public:
+
+alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size.
+
+// Limits for when to switch between multiplication algorithms.
+enum : int { KARATSUBALIMIT = 18 }; // Minimum value for which Karatsuba is worthwhile.
+enum : int { KARATSUBASQUARELIMIT=26 }; // Minimum value for which square Karatsuba is worthwhile
+
+/** Multi-byte addition or subtraction
+ * dest[#] = src1[#] + src2[#] + carry (0 or 1).
+ * or dest[#] = src1[#] - src2[#] - carry (0 or 1).
+ * Returns carry or borrow (0 or 1).
+ * Set op == '+' for addition, '-' for subtraction.
+ */
+uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint []
+ src2, uint carry) pure
+{
+ // Timing:
+ // Pentium M: 2.25/int
+ // P6 family, Core2 have a partial flags stall when reading the carry flag in
+ // an ADC, SBB operation after an operation such as INC or DEC which
+ // modifies some, but not all, flags. We avoid this by storing carry into
+ // a resister (AL), and restoring it after the branch.
+
+ enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+ asm pure nothrow {
+ naked;
+ push EDI;
+ push EBX;
+ push ESI;
+ mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
+ mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
+ mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
+ mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
+ // Carry is in EAX
+ // Count UP to zero (from -len) to minimize loop overhead.
+ lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
+ lea ESI, [ESI + 4*ECX]; // EBP = end of src2.
+ lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
+
+ neg ECX;
+ add ECX, 8;
+ jb L2; // if length < 8 , bypass the unrolled loop.
+L_unrolled:
+ shr AL, 1; // get carry from EAX
+ }
+ mixin(" asm pure nothrow {"
+ ~ indexedLoopUnroll( 8,
+ "mov EAX, [@*4-8*4+EDX+ECX*4];"
+ ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
+ ~ "mov [@*4-8*4+EDI+ECX*4], EAX;")
+ ~ "}");
+ asm pure nothrow {
+ setc AL; // save carry
+ add ECX, 8;
+ ja L_unrolled;
+L2: // Do the residual 1 .. 7 ints.
+
+ sub ECX, 8;
+ jz done;
+L_residual:
+ shr AL, 1; // get carry from EAX
+ }
+ mixin(" asm pure nothrow {"
+ ~ indexedLoopUnroll( 1,
+ "mov EAX, [@*4+EDX+ECX*4];"
+ ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
+ ~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
+ asm pure nothrow {
+ setc AL; // save carry
+ add ECX, 1;
+ jnz L_residual;
+done:
+ and EAX, 1; // make it O or 1.
+ pop ESI;
+ pop EBX;
+ pop EDI;
+ ret 6*4;
+ }
+}
+
+@system unittest
+{
+ uint [] a = new uint[40];
+ uint [] b = new uint[40];
+ uint [] c = new uint[40];
+ for (int i=0; i<a.length; ++i)
+ {
+ if (i&1) a[i]=0x8000_0000 + i;
+ else a[i]=i;
+ b[i]= 0x8000_0003;
+ }
+ c[19]=0x3333_3333;
+ uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0);
+ assert(carry == 1);
+ assert(c[0]==0x8000_0003);
+ assert(c[1]==4);
+ assert(c[19]==0x3333_3333); // check for overrun
+ for (int i=0; i<a.length; ++i)
+ {
+ a[i]=b[i]=c[i]=0;
+ }
+ a[8]=0x048D159E;
+ b[8]=0x048D159E;
+ a[10]=0x1D950C84;
+ b[10]=0x1D950C84;
+ a[5] =0x44444444;
+ carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
+ assert(a[11]==0);
+ for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0);
+
+ for (int q=3; q<36;++q)
+ {
+ for (int i=0; i<a.length; ++i)
+ {
+ a[i]=b[i]=c[i]=0;
+ }
+ a[q-2]=0x040000;
+ b[q-2]=0x040000;
+ carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
+ assert(a[q-2]==0);
+ }
+}
+
+/** dest[#] += carry, or dest[#] -= carry.
+ * op must be '+' or '-'
+ * Returns final carry or borrow (0 or 1)
+ */
+uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure
+{
+ enum { LASTPARAM = 1*4 } // 0* pushes + return address.
+ asm pure nothrow {
+ naked;
+ mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
+ mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
+ // EAX = carry
+L1: ;
+ }
+ static if (op=='+')
+ asm pure nothrow { add [EDX], EAX; }
+ else
+ asm pure nothrow { sub [EDX], EAX; }
+ asm pure nothrow {
+ mov EAX, 1;
+ jnc L2;
+ add EDX, 4;
+ dec ECX;
+ jnz L1;
+ mov EAX, 2;
+L2: dec EAX;
+ ret 2*4;
+ }
+}
+
+/** dest[#] = src[#] << numbits
+ * numbits must be in the range 1 .. 31
+ * Returns the overflow
+ */
+uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure
+{
+ // Timing: Optimal for P6 family.
+ // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
+ // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
+ enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+ asm pure nothrow {
+ naked;
+ push ESI;
+ push EDI;
+ push EBX;
+ mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+ mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+ mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+ mov ECX, EAX; // numbits;
+
+ mov EAX, [-4+ESI + 4*EBX];
+ mov EDX, 0;
+ shld EDX, EAX, CL;
+ push EDX; // Save return value
+ cmp EBX, 1;
+ jz L_last;
+ mov EDX, [-4+ESI + 4*EBX];
+ test EBX, 1;
+ jz L_odd;
+ sub EBX, 1;
+L_even:
+ mov EDX, [-4+ ESI + 4*EBX];
+ shld EAX, EDX, CL;
+ mov [EDI+4*EBX], EAX;
+L_odd:
+ mov EAX, [-8+ESI + 4*EBX];
+ shld EDX, EAX, CL;
+ mov [-4+EDI + 4*EBX], EDX;
+ sub EBX, 2;
+ jg L_even;
+L_last:
+ shl EAX, CL;
+ mov [EDI], EAX;
+ pop EAX; // pop return value
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 4*4;
+ }
+}
+
+/** dest[#] = src[#] >> numbits
+ * numbits must be in the range 1 .. 31
+ * This version uses MMX.
+ */
+uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure
+{
+ // Timing:
+ // K7 1.2/int. PM 1.7/int P4 5.3/int
+ enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+ asm pure nothrow {
+ naked;
+ push ESI;
+ push EDI;
+ push EBX;
+ mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+ mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+ mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+
+ movd MM3, EAX; // numbits = bits to shift left
+ xor EAX, 63;
+ align 16;
+ inc EAX;
+ movd MM4, EAX ; // 64-numbits = bits to shift right
+
+ // Get the return value into EAX
+ and EAX, 31; // EAX = 32-numbits
+ movd MM2, EAX; // 32-numbits
+ movd MM1, [ESI+4*EBX-4];
+ psrlq MM1, MM2;
+ movd EAX, MM1; // EAX = return value
+ test EBX, 1;
+ jz L_even;
+L_odd:
+ cmp EBX, 1;
+ jz L_length1;
+
+ // deal with odd lengths
+ movq MM1, [ESI+4*EBX-8];
+ psrlq MM1, MM2;
+ movd [EDI +4*EBX-4], MM1;
+ sub EBX, 1;
+L_even: // It's either singly or doubly even
+ movq MM2, [ESI + 4*EBX - 8];
+ psllq MM2, MM3;
+ sub EBX, 2;
+ jle L_last;
+ movq MM1, MM2;
+ add EBX, 2;
+ test EBX, 2;
+ jz L_onceeven;
+ sub EBX, 2;
+
+ // MAIN LOOP -- 128 bytes per iteration
+ L_twiceeven: // here MM2 is the carry
+ movq MM0, [ESI + 4*EBX-8];
+ psrlq MM0, MM4;
+ movq MM1, [ESI + 4*EBX-8];
+ psllq MM1, MM3;
+ por MM2, MM0;
+ movq [EDI +4*EBX], MM2;
+L_onceeven: // here MM1 is the carry
+ movq MM0, [ESI + 4*EBX-16];
+ psrlq MM0, MM4;
+ movq MM2, [ESI + 4*EBX-16];
+ por MM1, MM0;
+ movq [EDI +4*EBX-8], MM1;
+ psllq MM2, MM3;
+ sub EBX, 4;
+ jg L_twiceeven;
+L_last:
+ movq [EDI +4*EBX], MM2;
+L_alldone:
+ emms; // NOTE: costs 6 cycles on Intel CPUs
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 4*4;
+
+L_length1:
+ // length 1 is a special case
+ movd MM1, [ESI];
+ psllq MM1, MM3;
+ movd [EDI], MM1;
+ jmp L_alldone;
+ }
+}
+
+void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure
+{
+ enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+ asm pure nothrow {
+ naked;
+ push ESI;
+ push EDI;
+ push EBX;
+ mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+ mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+align 16;
+ mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+ lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+ lea ESI, [ESI + 4*EBX]; // ESI = end of src
+ neg EBX; // count UP to zero.
+
+ movd MM3, EAX; // numbits = bits to shift right
+ xor EAX, 63;
+ inc EAX;
+ movd MM4, EAX ; // 64-numbits = bits to shift left
+
+ test EBX, 1;
+ jz L_even;
+L_odd:
+ // deal with odd lengths
+ and EAX, 31; // EAX = 32-numbits
+ movd MM2, EAX; // 32-numbits
+ cmp EBX, -1;
+ jz L_length1;
+
+ movq MM0, [ESI+4*EBX];
+ psrlq MM0, MM3;
+ movd [EDI +4*EBX], MM0;
+ add EBX, 1;
+L_even:
+ movq MM2, [ESI + 4*EBX];
+ psrlq MM2, MM3;
+
+ movq MM1, MM2;
+ add EBX, 4;
+ cmp EBX, -2+4;
+ jz L_last;
+ // It's either singly or doubly even
+ sub EBX, 2;
+ test EBX, 2;
+ jnz L_onceeven;
+ add EBX, 2;
+
+ // MAIN LOOP -- 128 bytes per iteration
+ L_twiceeven: // here MM2 is the carry
+ movq MM0, [ESI + 4*EBX-8];
+ psllq MM0, MM4;
+ movq MM1, [ESI + 4*EBX-8];
+ psrlq MM1, MM3;
+ por MM2, MM0;
+ movq [EDI +4*EBX-16], MM2;
+L_onceeven: // here MM1 is the carry
+ movq MM0, [ESI + 4*EBX];
+ psllq MM0, MM4;
+ movq MM2, [ESI + 4*EBX];
+ por MM1, MM0;
+ movq [EDI +4*EBX-8], MM1;
+ psrlq MM2, MM3;
+ add EBX, 4;
+ jl L_twiceeven;
+L_last:
+ movq [EDI +4*EBX-16], MM2;
+L_alldone:
+ emms; // NOTE: costs 6 cycles on Intel CPUs
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 4*4;
+
+L_length1:
+ // length 1 is a special case
+ movd MM1, [ESI+4*EBX];
+ psrlq MM1, MM3;
+ movd [EDI +4*EBX], MM1;
+ jmp L_alldone;
+
+ }
+}
+
+/** dest[#] = src[#] >> numbits
+ * numbits must be in the range 1 .. 31
+ */
+void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure
+{
+ // Timing: Optimal for P6 family.
+ // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
+ // Terrible performance on AMD64, which has 7 cycles for SHRD!!
+ enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+ asm pure nothrow {
+ naked;
+ push ESI;
+ push EDI;
+ push EBX;
+ mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+ mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+ mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+ mov ECX, EAX; // numbits;
+
+ lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+ lea ESI, [ESI + 4*EBX]; // ESI = end of src
+ neg EBX; // count UP to zero.
+ mov EAX, [ESI + 4*EBX];
+ cmp EBX, -1;
+ jz L_last;
+ mov EDX, [ESI + 4*EBX];
+ test EBX, 1;
+ jz L_odd;
+ add EBX, 1;
+L_even:
+ mov EDX, [ ESI + 4*EBX];
+ shrd EAX, EDX, CL;
+ mov [-4 + EDI+4*EBX], EAX;
+L_odd:
+ mov EAX, [4 + ESI + 4*EBX];
+ shrd EDX, EAX, CL;
+ mov [EDI + 4*EBX], EDX;
+ add EBX, 2;
+ jl L_even;
+L_last:
+ shr EAX, CL;
+ mov [-4 + EDI], EAX;
+
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 4*4;
+ }
+}
+
+@system unittest
+{
+
+ uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteShr(aa[0..$-1], aa, 4);
+ assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555
+ && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
+
+ aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteShr(aa[2..$-1], aa[2..$-1], 4);
+ assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
+ && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
+
+ aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteShr(aa[0..$-2], aa, 4);
+ assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
+ assert(aa[0]==0x6122_2222);
+ assert(aa[3]==0xBCCC_CCCD);
+
+
+ aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4);
+ assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
+ && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
+ assert(r == 8);
+
+ aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4);
+ assert(aa[0] == 0xF0FF_FFFF
+ && aa[2]==0x5555_5561);
+ assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
+ assert(r == 8);
+ assert(aa[1]==0x2222_2230);
+
+ aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31);
+}
+
+/** dest[#] = src[#] * multiplier + carry.
+ * Returns carry.
+ */
+uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry)
+ pure
+{
+ // Timing: definitely not optimal.
+ // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
+ // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
+
+ enum { LASTPARAM = 4*4 } // 4* pushes + return address.
+ // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+ // when initializing variables to zero.
+ version (D_PIC)
+ {
+ enum { zero = 0 }
+ }
+ else
+ {
+ __gshared int zero = 0;
+ }
+ asm pure nothrow {
+ naked;
+ push ESI;
+ push EDI;
+ push EBX;
+
+ mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
+ mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
+ mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr
+ align 16;
+ lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+ lea ESI, [ESI + 4*EBX]; // ESI = end of src
+ mov ECX, EAX; // [carry]; -- last param is in EAX.
+ neg EBX; // count UP to zero.
+ test EBX, 1;
+ jnz L_odd;
+ add EBX, 1;
+ L1:
+ mov EAX, [-4 + ESI + 4*EBX];
+ mul int ptr [ESP+LASTPARAM]; //[multiplier];
+ add EAX, ECX;
+ mov ECX, zero;
+ mov [-4+EDI + 4*EBX], EAX;
+ adc ECX, EDX;
+L_odd:
+ mov EAX, [ESI + 4*EBX]; // p2
+ mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3,
+ add EAX, ECX;
+ mov ECX, zero;
+ adc ECX, EDX;
+ mov [EDI + 4*EBX], EAX;
+ add EBX, 2;
+ jl L1;
+
+ mov EAX, ECX; // get final carry
+
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 5*4;
+ }
+}
+
+@system unittest
+{
+ uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
+ assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 &&
+ aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
+}
+
+// The inner multiply-and-add loop, together with the Even entry point.
+// Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
+// This is the most time-critical code in the BigInt library.
+// It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
+string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure {
+ // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
+ // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load
+ // (unit p2) for initializing registers to zero.
+ // There are also dependencies between the instructions, and we run up against the
+ // ROB-read limit (can only read 2 registers per cycle).
+ // We also need the number of uops in the loop to be a multiple of 3.
+ // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
+ // if Position-Independent Code is required.
+
+ // Register usage
+ // ESI = end of src
+ // EDI = end of dest
+ // EBX = index. Counts up to zero (in steps of 2).
+ // EDX:EAX = scratch, used in multiply.
+ // ECX = carry1.
+ // EBP = carry2.
+ // ESP = points to the multiplier.
+
+ // The first member of 'dest' which will be modified is [EDI+4*EBX].
+ // EAX must already contain the first member of 'src', [ESI+4*EBX].
+
+ version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; }
+ return "
+ // Entry point for even length
+ add EBX, 1;
+ mov EBP, ECX; // carry
+
+ mul int ptr [" ~ M_ADDRESS ~ "]; // M
+ mov ECX, 0;
+
+ add EBP, EAX;
+ mov EAX, [ESI+4*EBX];
+ adc ECX, EDX;
+
+ mul int ptr [" ~ M_ADDRESS ~ "]; // M
+ " ~ OP ~ " [-4+EDI+4*EBX], EBP;
+ mov EBP, zero;
+
+ adc ECX, EAX;
+ mov EAX, [4+ESI+4*EBX];
+
+ adc EBP, EDX;
+ add EBX, 2;
+ jnl L_done;
+L1:
+ mul int ptr [" ~ M_ADDRESS ~ "];
+ " ~ OP ~ " [-8+EDI+4*EBX], ECX;
+ adc EBP, EAX;
+ mov ECX, zero;
+ mov EAX, [ESI+4*EBX];
+ adc ECX, EDX;
+" ~
+ (using_PIC ? "" : " mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
+~ "
+ mul int ptr [" ~ M_ADDRESS ~ "];
+ " ~ OP ~ " [-4+EDI+4*EBX], EBP;
+ mov EBP, zero;
+
+ adc ECX, EAX;
+ mov EAX, [4+ESI+4*EBX];
+
+ adc EBP, EDX;
+ add EBX, 2;
+ jl L1;
+L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
+ adc EBP, 0;
+";
+ // final carry is now in EBP
+}
+
+string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure
+{
+ return "
+ mul int ptr [" ~M_ADDRESS ~"];
+ mov EBP, zero;
+ add ECX, EAX;
+ mov EAX, [4+ESI+4*EBX];
+
+ adc EBP, EDX;
+ add EBX, 2;
+ jl L1;
+ jmp L_done;
+";
+}
+
+
+
+/**
+ * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF).
+ * where op == '+' or '-'
+ * Returns carry out of MSB (0 .. FFFF_FFFF).
+ */
+uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint
+ multiplier, uint carry) pure {
+ // Timing: This is the most time-critical bignum function.
+ // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
+
+ // The main loop is pipelined and unrolled by 2,
+ // so entry to the loop is also complicated.
+
+ // Register usage
+ // EDX:EAX = multiply
+ // EBX = counter
+ // ECX = carry1
+ // EBP = carry2
+ // EDI = dest
+ // ESI = src
+
+ enum string OP = (op=='+')? "add" : "sub";
+ version (D_PIC)
+ {
+ enum { zero = 0 }
+ }
+ else
+ {
+ // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+ // when initializing registers to zero.
+ __gshared int zero = 0;
+ // use p3/p4 units
+ __gshared int storagenop; // write-only
+ }
+
+ enum { LASTPARAM = 5*4 } // 4* pushes + return address.
+ asm pure nothrow {
+ naked;
+
+ push ESI;
+ push EDI;
+ push EBX;
+ push EBP;
+ mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
+ mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
+ align 16;
+ nop;
+ mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr
+ lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+ lea ESI, [ESI + 4*EBX]; // ESI = end of src
+ mov EBP, 0;
+ mov ECX, EAX; // ECX = input carry.
+ neg EBX; // count UP to zero.
+ mov EAX, [ESI+4*EBX];
+ test EBX, 1;
+ jnz L_enter_odd;
+ }
+ // Main loop, with entry point for even length
+ mixin("asm pure nothrow {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}");
+ asm pure nothrow {
+ mov EAX, EBP; // get final carry
+ pop EBP;
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 5*4;
+ }
+L_enter_odd:
+ mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}");
+}
+
+@system unittest
+{
+
+ uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+ uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
+ multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
+ assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
+ assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
+ && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
+}
+
+/**
+ Sets result[#] = result[0 .. left.length] + left[#] * right[#]
+
+ It is defined in this way to allow cache-efficient multiplication.
+ This function is equivalent to:
+ ----
+ for (int i = 0; i< right.length; ++i)
+ {
+ dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
+ left, right[i], 0);
+ }
+ ----
+ */
+void multibyteMultiplyAccumulate(uint [] dest, const uint[] left,
+ const uint [] right) pure {
+ // Register usage
+ // EDX:EAX = used in multiply
+ // EBX = index
+ // ECX = carry1
+ // EBP = carry2
+ // EDI = end of dest for this pass through the loop. Index for outer loop.
+ // ESI = end of left. never changes
+ // [ESP] = M = right[i] = multiplier for this pass through the loop.
+ // right.length is changed into dest.ptr+dest.length
+ version (D_PIC)
+ {
+ enum { zero = 0 }
+ }
+ else
+ {
+ // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+ // when initializing registers to zero.
+ __gshared int zero = 0;
+ // use p3/p4 units
+ __gshared int storagenop; // write-only
+ }
+
+ enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
+ asm pure nothrow {
+ naked;
+
+ push ESI;
+ push EDI;
+ align 16;
+ push EBX;
+ push EBP;
+ push EAX; // local variable M
+ mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
+ mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
+ mov ESI, [ESP + LASTPARAM + 4*3]; // left.ptr
+ lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
+
+ mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
+ lea EAX, [EDI + 4*EAX];
+ mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI
+
+ lea ESI, [ESI + 4*EBX]; // ESI = end of left
+ mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
+ mov EAX, [EAX];
+ mov [ESP], EAX; // M
+outer_loop:
+ mov EBP, 0;
+ mov ECX, 0; // ECX = input carry.
+ neg EBX; // count UP to zero.
+ mov EAX, [ESI+4*EBX];
+ test EBX, 1;
+ jnz L_enter_odd;
+ }
+ // -- Inner loop, with even entry point
+ mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
+ asm pure nothrow {
+ mov [-4+EDI+4*EBX], EBP;
+ add EDI, 4;
+ cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
+ jz outer_done;
+ mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
+ mov EAX, [EAX+4]; // get new M
+ mov [ESP], EAX; // save new M
+ add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
+ mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
+ jmp outer_loop;
+outer_done:
+ pop EAX;
+ pop EBP;
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 6*4;
+ }
+L_enter_odd:
+ mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
+}
+
+/** dest[#] /= divisor.
+ * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
+ * divisor must not be a power of 2 (use right shift for that case;
+ * A division by zero will occur if divisor is a power of 2).
+ * Returns the final remainder
+ *
+ * Based on public domain code by Eric Bainville.
+ * (http://www.bealto.com/) Used with permission.
+ */
+uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure
+{
+ // Timing: limited by a horrible dependency chain.
+ // Pentium M: 18 cycles/op, 8 resource stalls/op.
+ // EAX, EDX = scratch, used by MUL
+ // EDI = dest
+ // CL = shift
+ // ESI = quotient
+ // EBX = remainderhi
+ // EBP = remainderlo
+ // [ESP-4] = mask
+ // [ESP] = kinv (2^64 /divisor)
+ enum { LASTPARAM = 5*4 } // 4* pushes + return address.
+ enum { LOCALS = 2*4} // MASK, KINV
+ asm pure nothrow {
+ naked;
+
+ push ESI;
+ push EDI;
+ push EBX;
+ push EBP;
+
+ mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
+ mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
+
+ // Loop from msb to lsb
+ lea EDI, [EDI + 4*EBX];
+ mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1
+ // Build the pseudo-inverse of divisor k: 2^64/k
+ // First determine the shift in ecx to get the max number of bits in kinv
+ xor ECX, ECX;
+ mov EAX, [ESP + LASTPARAM]; //divisor;
+ mov EDX, 1;
+kinv1:
+ inc ECX;
+ ror EDX, 1;
+ shl EAX, 1;
+ jnc kinv1;
+ dec ECX;
+ // Here, ecx is a left shift moving the msb of k to bit 32
+
+ mov EAX, 1;
+ shl EAX, CL;
+ dec EAX;
+ ror EAX, CL ; //ecx bits at msb
+ push EAX; // MASK
+
+ // Then divide 2^(32+cx) by divisor (edx already ok)
+ xor EAX, EAX;
+ div int ptr [ESP + LASTPARAM + LOCALS-4*1]; //divisor;
+ push EAX; // kinv
+ align 16;
+L2:
+ // Get 32 bits of quotient approx, multiplying
+ // most significant word of (rem*2^32+input)
+ mov EAX, [ESP+4]; //MASK;
+ and EAX, [EDI - 4];
+ or EAX, EBP;
+ rol EAX, CL;
+ mov EBX, EBP;
+ mov EBP, [EDI - 4];
+ mul int ptr [ESP]; //KINV;
+
+ shl EAX, 1;
+ rcl EDX, 1;
+
+ // Multiply by k and subtract to get remainder
+ // Subtraction must be done on two words
+ mov EAX, EDX;
+ mov ESI, EDX; // quot = high word
+ mul int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
+ sub EBP, EAX;
+ sbb EBX, EDX;
+ jz Lb; // high word is 0, goto adjust on single word
+
+ // Adjust quotient and remainder on two words
+Ld: inc ESI;
+ sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+ sbb EBX, 0;
+ jnz Ld;
+
+ // Adjust quotient and remainder on single word
+Lb: cmp EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+ jc Lc; // rem in 0 .. divisor-1, OK
+ sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+ inc ESI;
+ jmp Lb;
+
+ // Store result
+Lc:
+ mov [EDI - 4], ESI;
+ lea EDI, [EDI - 4];
+ dec int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
+ jnz L2;
+
+ pop EAX; // discard kinv
+ pop EAX; // discard mask
+
+ mov EAX, EBP; // return final remainder
+ pop EBP;
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 3*4;
+ }
+}
+
+@system unittest
+{
+ uint [] aa = new uint[101];
+ for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
+ uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
+ uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
+ for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
+ assert(r == 0x33FF_7461);
+}
+
+// Set dest[2*i .. 2*i+1]+=src[i]*src[i]
+void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure
+{
+ /* Unlike mulAdd, the carry is only 1 bit,
+ since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
+ Note also that on the last iteration, no carry can occur.
+ As for multibyteAdd, we save & restore carry flag through the loop.
+
+ The timing is entirely dictated by the dependency chain. We could
+ improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
+ */
+ enum { LASTPARAM = 4*5 } // 4* pushes + return address.
+ asm pure nothrow {
+ naked;
+ push ESI;
+ push EDI;
+ push EBX;
+ push ECX;
+ mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+ mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
+ mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+ lea EDI, [EDI + 8*EBX]; // EDI = end of dest
+ lea ESI, [ESI + 4*EBX]; // ESI = end of src
+ neg EBX; // count UP to zero.
+ xor ECX, ECX; // initial carry = 0.
+L1:
+ mov EAX, [ESI + 4*EBX];
+ mul EAX, EAX;
+ shr CL, 1; // get carry
+ adc [EDI + 8*EBX], EAX;
+ adc [EDI + 8*EBX + 4], EDX;
+ setc CL; // save carry
+ inc EBX;
+ jnz L1;
+
+ pop ECX;
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 4*4;
+ }
+}
+
+@system unittest
+{
+ uint [] aa = new uint[13];
+ uint [] bb = new uint[6];
+ for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
+ for (int i=0; i<bb.length; ++i) bb[i] = i;
+ aa[$-1]= 7;
+ multibyteAddDiagonalSquares(aa[0..$-1], bb);
+ assert(aa[$-1]==7);
+ for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
+}
+
+void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure
+{
+ for (int i = 0; i < x.length-3; ++i)
+ {
+ dest[i+x.length] = multibyteMulAdd!('+')(
+ dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
+ }
+ ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
+ dest[$-5] = cast(uint) c;
+ c >>= 32;
+ c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
+ dest[$-4] = cast(uint) c;
+ c >>= 32;
+length2:
+ c += cast(ulong)(x[$-2]) * x[$-1];
+ dest[$-3] = cast(uint) c;
+ c >>= 32;
+ dest[$-2] = cast(uint) c;
+}
+
+//dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
+// assert(dest.length = src.length*2);
+// assert(src.length >= 3);
+void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure
+{
+ // Register usage
+ // EDX:EAX = used in multiply
+ // EBX = index
+ // ECX = carry1
+ // EBP = carry2
+ // EDI = end of dest for this pass through the loop. Index for outer loop.
+ // ESI = end of src. never changes
+ // [ESP] = M = src[i] = multiplier for this pass through the loop.
+ // dest.length is changed into dest.ptr+dest.length
+ version (D_PIC)
+ {
+ enum { zero = 0 }
+ }
+ else
+ {
+ // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+ // when initializing registers to zero.
+ __gshared int zero = 0;
+ // use p3/p4 units
+ __gshared int storagenop; // write-only
+ }
+
+ enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
+ asm pure nothrow {
+ naked;
+
+ push ESI;
+ push EDI;
+ align 16;
+ push EBX;
+ push EBP;
+ push EAX; // local variable M= src[i]
+ mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
+ mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
+ mov ESI, [ESP + LASTPARAM + 4*1]; // src.ptr
+
+ lea ESI, [ESI + 4*EBX]; // ESI = end of left
+ add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M
+
+ // local variable [ESP + LASTPARAM + 4*2] = last value for EDI
+ lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
+
+ lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
+ mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI = &dest[src.length*2 -3]
+
+ cmp EBX, 3;
+ jz length_is_3;
+
+ // We start at src[1], not src[0].
+ dec EBX;
+ mov [ESP + LASTPARAM + 4*0], EBX;
+
+outer_loop:
+ mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
+ mov EBP, 0;
+ mov ECX, 0; // ECX = input carry.
+ dec [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
+ neg EBX; // count UP to zero.
+
+ mov EAX, [ESI + 4*EBX - 4*1]; // get new M
+ mov [ESP], EAX; // save new M
+
+ mov EAX, [ESI+4*EBX];
+ test EBX, 1;
+ jnz L_enter_odd;
+ }
+ // -- Inner loop, with even entry point
+ mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
+ asm pure nothrow {
+ mov [-4+EDI+4*EBX], EBP;
+ add EDI, 4;
+ cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
+ jnz outer_loop;
+length_is_3:
+ mov EAX, [ESI - 4*3];
+ mul EAX, [ESI - 4*2];
+ mov ECX, 0;
+ add [EDI-2*4], EAX; // ECX:dest[$-5] += x[$-3] * x[$-2]
+ adc ECX, EDX;
+
+ mov EAX, [ESI - 4*3];
+ mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
+ add EAX, ECX;
+ mov ECX, 0;
+ adc EDX, 0;
+ // now EDX: EAX = c + x[$-3] * x[$-1]
+ add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
+ adc ECX, EDX; // ECX holds dest[$-3], it acts as carry for the last row
+// do length == 2
+ mov EAX, [ESI - 4*2];
+ mul EAX, [ESI - 4*1];
+ add ECX, EAX;
+ adc EDX, 0;
+ mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
+ mov [EDI + 1*4], EDX;
+
+ pop EAX;
+ pop EBP;
+ pop EBX;
+ pop EDI;
+ pop ESI;
+ ret 4*4;
+ }
+L_enter_odd:
+ mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
+}
+
+@system unittest
+{
+ uint [] aa = new uint[200];
+ uint [] a = aa[0 .. 100];
+ uint [] b = new uint [100];
+ aa[] = 761;
+ a[] = 0;
+ b[] = 0;
+ a[3] = 6;
+ b[0]=1;
+ b[1] = 17;
+ b[50 .. 100]=78;
+ multibyteTriangleAccumulateAsm(a, b[0 .. 50]);
+ uint [] c = new uint[100];
+ c[] = 0;
+ c[1] = 17;
+ c[3] = 6;
+ assert(a[]==c[]);
+ assert(a[0]==0);
+ aa[] = 0xFFFF_FFFF;
+ a[] = 0;
+ b[] = 0;
+ b[0]= 0xbf6a1f01;
+ b[1]= 0x6e38ed64;
+ b[2]= 0xdaa797ed;
+ b[3] = 0;
+
+ multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]);
+ assert(a[1]==0x3a600964);
+ assert(a[2]==0x339974f6);
+ assert(a[3]==0x46736fce);
+ assert(a[4]==0x5e24a2b4);
+
+ b[3] = 0xe93ff9f4;
+ b[4] = 0x184f03;
+ a[]=0;
+ multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]);
+ assert(a[3]==0x79fff5c2);
+ assert(a[4]==0xcf384241);
+ assert(a[5]== 0x4a17fc8);
+ assert(a[6]==0x4d549025);
+}
+
+
+void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure
+{
+ if (x.length < 4)
+ {
+ // Special cases, not worth doing triangular.
+ result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0);
+ multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
+ return;
+ }
+ // Do half a square multiply.
+ // dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
+ result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0);
+ multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
+ // Multiply by 2
+ result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
+ // And add the diagonal elements
+ result[0] = 0;
+ multibyteAddDiagonalSquares(result, x);
+}
+
+version (BignumPerformanceTest)
+{
+import core.stdc.stdio;
+int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }
+
+__gshared uint [2200] X1;
+__gshared uint [2200] Y1;
+__gshared uint [4000] Z1;
+
+void testPerformance() pure
+{
+ // The performance results at the top of this file were obtained using
+ // a Windows device driver to access the CPU performance counters.
+ // The code below is less accurate but more widely usable.
+ // The value for division is quite inconsistent.
+ for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
+ int t, t0;
+ multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
+ t0 = clock();
+ multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7);
+ t = clock();
+ multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
+ auto shltime = (clock() - t) - (t - t0);
+ t0 = clock();
+ multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13);
+ t = clock();
+ multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13);
+ auto shrtime = (clock() - t) - (t - t0);
+ t0 = clock();
+ multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0);
+ t = clock();
+ multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0);
+ auto addtime = (clock() - t) - (t-t0);
+ t0 = clock();
+ multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0);
+ t = clock();
+ multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0);
+ auto multime = (clock() - t) - (t - t0);
+ multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
+ t0 = clock();
+ multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0);
+ t = clock();
+ multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
+ auto muladdtime = (clock() - t) - (t - t0);
+ multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
+ t = clock();
+ multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
+ auto accumtime = clock() - t;
+ t0 = clock();
+ multibyteDivAssign(Z1[0 .. 2000], 217, 0);
+ t = clock();
+ multibyteDivAssign(Z1[0 .. 1000], 37, 0);
+ auto divtime = (t - t0) - (clock() - t);
+ t= clock();
+ multibyteSquare(Z1[0 .. 64], X1[0 .. 32]);
+ auto squaretime = clock() - t;
+
+ printf("-- BigInt asm performance (cycles/int) --\n");
+ printf("Add: %.2f\n", addtime/1000.0);
+ printf("Shl: %.2f\n", shltime/1000.0);
+ printf("Shr: %.2f\n", shrtime/1000.0);
+ printf("Mul: %.2f\n", multime/1000.0);
+ printf("MulAdd: %.2f\n", muladdtime/1000.0);
+ printf("Div: %.2f\n", divtime/1000.0);
+ printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
+ printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
+}
+
+static this()
+{
+ testPerformance();
+}
+}
+
+} // version (D_InlineAsm_X86)
diff --git a/libphobos/src/std/internal/math/errorfunction.d b/libphobos/src/std/internal/math/errorfunction.d
new file mode 100644
index 0000000..4012e64
--- /dev/null
+++ b/libphobos/src/std/internal/math/errorfunction.d
@@ -0,0 +1,1145 @@
+/**
+ * Error Functions and Normal Distribution.
+ *
+ * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
+ * Copyright: Based on the CEPHES math library, which is
+ * Copyright (C) 1994 Stephen L. Moshier (moshier@world.std.com).
+ * Authors: Stephen L. Moshier, ported to D by Don Clugston and David Nadlinger
+ */
+/**
+ * Macros:
+ * NAN = $(RED NAN)
+ * SUP = <span style="vertical-align:super;font-size:smaller">$0</span>
+ * GAMMA = &#915;
+ * INTEGRAL = &#8747;
+ * INTEGRATE = $(BIG &#8747;<sub>$(SMALL $1)</sub><sup>$2</sup>)
+ * POWER = $1<sup>$2</sup>
+ * BIGSUM = $(BIG &Sigma; <sup>$2</sup><sub>$(SMALL $1)</sub>)
+ * CHOOSE = $(BIG &#40;) <sup>$(SMALL $1)</sup><sub>$(SMALL $2)</sub> $(BIG &#41;)
+ * TABLE_SV = <table border="1" cellpadding="4" cellspacing="0">
+ * <caption>Special Values</caption>
+ * $0</table>
+ * SVH = $(TR $(TH $1) $(TH $2))
+ * SV = $(TR $(TD $1) $(TD $2))
+ */
+module std.internal.math.errorfunction;
+import std.math;
+
+pure:
+nothrow:
+@safe:
+@nogc:
+
+private {
+immutable real EXP_2 = 0.135335283236612691893999494972484403L; /* exp(-2) */
+enum real SQRT2PI = 2.50662827463100050241576528481104525L; // sqrt(2pi)
+
+
+enum real MAXLOG = 0x1.62e42fefa39ef358p+13L; // log(real.max)
+enum real MINLOG = -0x1.6436716d5406e6d8p+13L; // log(real.min*real.epsilon) = log(smallest denormal)
+}
+
+T rationalPoly(T)(T x, const(T) [] numerator, const(T) [] denominator) pure nothrow
+{
+ return poly(x, numerator)/poly(x, denominator);
+}
+
+
+private {
+
+/* erfc(x) = exp(-x^2) P(1/x)/Q(1/x)
+ 1/8 <= 1/x <= 1
+ Peak relative error 5.8e-21 */
+immutable real[10] P = [ -0x1.30dfa809b3cc6676p-17, 0x1.38637cd0913c0288p+18,
+ 0x1.2f015e047b4476bp+22, 0x1.24726f46aa9ab08p+25, 0x1.64b13c6395dc9c26p+27,
+ 0x1.294c93046ad55b5p+29, 0x1.5962a82f92576dap+30, 0x1.11a709299faba04ap+31,
+ 0x1.11028065b087be46p+31, 0x1.0d8ef40735b097ep+30
+];
+
+immutable real[11] Q = [ 0x1.14d8e2a72dec49f4p+19, 0x1.0c880ff467626e1p+23,
+ 0x1.04417ef060b58996p+26, 0x1.404e61ba86df4ebap+28, 0x1.0f81887bc82b873ap+30,
+ 0x1.4552a5e39fb49322p+31, 0x1.11779a0ceb2a01cep+32, 0x1.3544dd691b5b1d5cp+32,
+ 0x1.a91781f12251f02ep+31, 0x1.0d8ef3da605a1c86p+30, 1.0
+];
+
+// For 128 bit quadruple-precision floats, we use a higher-precision implementation
+// with more polynomial segments.
+enum isIEEEQuadruple = floatTraits!real.realFormat == RealFormat.ieeeQuadruple;
+static if (isIEEEQuadruple)
+{
+ // erfc(x + 0.25) = erfc(0.25) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 1.4e-35
+ immutable real[9] RNr13 = [
+ -2.353707097641280550282633036456457014829E3L,
+ 3.871159656228743599994116143079870279866E2L,
+ -3.888105134258266192210485617504098426679E2L,
+ -2.129998539120061668038806696199343094971E1L,
+ -8.125462263594034672468446317145384108734E1L,
+ 8.151549093983505810118308635926270319660E0L,
+ -5.033362032729207310462422357772568553670E0L,
+ -4.253956621135136090295893547735851168471E-2L,
+ -8.098602878463854789780108161581050357814E-2L
+ ];
+ immutable real[9] RDr13 = [
+ 2.220448796306693503549505450626652881752E3L,
+ 1.899133258779578688791041599040951431383E2L,
+ 1.061906712284961110196427571557149268454E3L,
+ 7.497086072306967965180978101974566760042E1L,
+ 2.146796115662672795876463568170441327274E2L,
+ 1.120156008362573736664338015952284925592E1L,
+ 2.211014952075052616409845051695042741074E1L,
+ 6.469655675326150785692908453094054988938E-1L,
+ 1.0
+ ];
+
+ // erfc(0.25) = C13a + C13b to extra precision.
+ immutable real C13a = 0.723663330078125L;
+ immutable real C13b = 1.0279753638067014931732235184287934646022E-5L;
+
+ // erfc(x + 0.375) = erfc(0.375) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 1.2e-35
+ immutable real[9] RNr14 = [
+ -2.446164016404426277577283038988918202456E3L,
+ 6.718753324496563913392217011618096698140E2L,
+ -4.581631138049836157425391886957389240794E2L,
+ -2.382844088987092233033215402335026078208E1L,
+ -7.119237852400600507927038680970936336458E1L,
+ 1.313609646108420136332418282286454287146E1L,
+ -6.188608702082264389155862490056401365834E0L,
+ -2.787116601106678287277373011101132659279E-2L,
+ -2.230395570574153963203348263549700967918E-2L
+ ];
+ immutable real[9] RDr14 = [
+ 2.495187439241869732696223349840963702875E3L,
+ 2.503549449872925580011284635695738412162E2L,
+ 1.159033560988895481698051531263861842461E3L,
+ 9.493751466542304491261487998684383688622E1L,
+ 2.276214929562354328261422263078480321204E2L,
+ 1.367697521219069280358984081407807931847E1L,
+ 2.276988395995528495055594829206582732682E1L,
+ 7.647745753648996559837591812375456641163E-1L,
+ 1.0
+ ];
+
+ // erfc(0.375) = C14a + C14b to extra precision.
+ immutable real C14a = 0.5958709716796875L;
+ immutable real C14b = 1.2118885490201676174914080878232469565953E-5L;
+
+ // erfc(x + 0.5) = erfc(0.5) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 4.7e-36
+ immutable real[9] RNr15 = [
+ -2.624212418011181487924855581955853461925E3L,
+ 8.473828904647825181073831556439301342756E2L,
+ -5.286207458628380765099405359607331669027E2L,
+ -3.895781234155315729088407259045269652318E1L,
+ -6.200857908065163618041240848728398496256E1L,
+ 1.469324610346924001393137895116129204737E1L,
+ -6.961356525370658572800674953305625578903E0L,
+ 5.145724386641163809595512876629030548495E-3L,
+ 1.990253655948179713415957791776180406812E-2L
+ ];
+ immutable real[9] RDr15 = [
+ 2.986190760847974943034021764693341524962E3L,
+ 5.288262758961073066335410218650047725985E2L,
+ 1.363649178071006978355113026427856008978E3L,
+ 1.921707975649915894241864988942255320833E2L,
+ 2.588651100651029023069013885900085533226E2L,
+ 2.628752920321455606558942309396855629459E1L,
+ 2.455649035885114308978333741080991380610E1L,
+ 1.378826653595128464383127836412100939126E0L,
+ 1.0
+ ];
+ // erfc(0.5) = C15a + C15b to extra precision.
+ immutable real C15a = 0.4794921875L;
+ immutable real C15b = 7.9346869534623172533461080354712635484242E-6L;
+
+ // erfc(x + 0.625) = erfc(0.625) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 5.1e-36
+ immutable real[9] RNr16 = [
+ -2.347887943200680563784690094002722906820E3L,
+ 8.008590660692105004780722726421020136482E2L,
+ -5.257363310384119728760181252132311447963E2L,
+ -4.471737717857801230450290232600243795637E1L,
+ -4.849540386452573306708795324759300320304E1L,
+ 1.140885264677134679275986782978655952843E1L,
+ -6.731591085460269447926746876983786152300E0L,
+ 1.370831653033047440345050025876085121231E-1L,
+ 2.022958279982138755020825717073966576670E-2L,
+ ];
+ immutable real[9] RDr16 = [
+ 3.075166170024837215399323264868308087281E3L,
+ 8.730468942160798031608053127270430036627E2L,
+ 1.458472799166340479742581949088453244767E3L,
+ 3.230423687568019709453130785873540386217E2L,
+ 2.804009872719893612081109617983169474655E2L,
+ 4.465334221323222943418085830026979293091E1L,
+ 2.612723259683205928103787842214809134746E1L,
+ 2.341526751185244109722204018543276124997E0L,
+ 1.0
+ ];
+ // erfc(0.625) = C16a + C16b to extra precision.
+ immutable real C16a = 0.3767547607421875L;
+ immutable real C16b = 4.3570693945275513594941232097252997287766E-6L;
+
+ // erfc(x + 0.75) = erfc(0.75) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 1.7e-35
+ immutable real[9] RNr17 = [
+ -1.767068734220277728233364375724380366826E3L,
+ 6.693746645665242832426891888805363898707E2L,
+ -4.746224241837275958126060307406616817753E2L,
+ -2.274160637728782675145666064841883803196E1L,
+ -3.541232266140939050094370552538987982637E1L,
+ 6.988950514747052676394491563585179503865E0L,
+ -5.807687216836540830881352383529281215100E0L,
+ 3.631915988567346438830283503729569443642E-1L,
+ -1.488945487149634820537348176770282391202E-2L
+ ];
+ immutable real[9] RDr17 = [
+ 2.748457523498150741964464942246913394647E3L,
+ 1.020213390713477686776037331757871252652E3L,
+ 1.388857635935432621972601695296561952738E3L,
+ 3.903363681143817750895999579637315491087E2L,
+ 2.784568344378139499217928969529219886578E2L,
+ 5.555800830216764702779238020065345401144E1L,
+ 2.646215470959050279430447295801291168941E1L,
+ 2.984905282103517497081766758550112011265E0L,
+ 1.0
+ ];
+ // erfc(0.75) = C17a + C17b to extra precision.
+ immutable real C17a = 0.2888336181640625L;
+ immutable real C17b = 1.0748182422368401062165408589222625794046E-5L;
+
+
+ // erfc(x + 0.875) = erfc(0.875) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 2.2e-35
+ immutable real[9] RNr18 = [
+ -1.342044899087593397419622771847219619588E3L,
+ 6.127221294229172997509252330961641850598E2L,
+ -4.519821356522291185621206350470820610727E2L,
+ 1.223275177825128732497510264197915160235E1L,
+ -2.730789571382971355625020710543532867692E1L,
+ 4.045181204921538886880171727755445395862E0L,
+ -4.925146477876592723401384464691452700539E0L,
+ 5.933878036611279244654299924101068088582E-1L,
+ -5.557645435858916025452563379795159124753E-2L
+ ];
+ immutable real[9] RDr18 = [
+ 2.557518000661700588758505116291983092951E3L,
+ 1.070171433382888994954602511991940418588E3L,
+ 1.344842834423493081054489613250688918709E3L,
+ 4.161144478449381901208660598266288188426E2L,
+ 2.763670252219855198052378138756906980422E2L,
+ 5.998153487868943708236273854747564557632E1L,
+ 2.657695108438628847733050476209037025318E1L,
+ 3.252140524394421868923289114410336976512E0L,
+ 1.0
+ ];
+
+ // erfc(0.875) = C18a + C18b to extra precision.
+ immutable real C18a = 0.215911865234375L;
+ immutable real C18b = 1.3073705765341685464282101150637224028267E-5L;
+
+ // erfc(x + 1.0) = erfc(1.0) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 1.6e-35
+ immutable real[9] RNr19 = [
+ -1.139180936454157193495882956565663294826E3L,
+ 6.134903129086899737514712477207945973616E2L,
+ -4.628909024715329562325555164720732868263E2L,
+ 4.165702387210732352564932347500364010833E1L,
+ -2.286979913515229747204101330405771801610E1L,
+ 1.870695256449872743066783202326943667722E0L,
+ -4.177486601273105752879868187237000032364E0L,
+ 7.533980372789646140112424811291782526263E-1L,
+ -8.629945436917752003058064731308767664446E-2L
+ ];
+ immutable real[9] RDr19 = [
+ 2.744303447981132701432716278363418643778E3L,
+ 1.266396359526187065222528050591302171471E3L,
+ 1.466739461422073351497972255511919814273E3L,
+ 4.868710570759693955597496520298058147162E2L,
+ 2.993694301559756046478189634131722579643E2L,
+ 6.868976819510254139741559102693828237440E1L,
+ 2.801505816247677193480190483913753613630E1L,
+ 3.604439909194350263552750347742663954481E0L,
+ 1.0
+ ];
+
+ // erfc(1.0) = C19a + C19b to extra precision.
+ immutable real C19a = 0.15728759765625L;
+ immutable real C19b = 1.1609394035130658779364917390740703933002E-5L;
+
+ // erfc(x + 1.125) = erfc(1.125) + x R(x)
+ // 0 <= x < 0.125
+ // Peak relative error 3.6e-36
+ immutable real[9] RNr20 = [
+ -9.652706916457973956366721379612508047640E2L,
+ 5.577066396050932776683469951773643880634E2L,
+ -4.406335508848496713572223098693575485978E2L,
+ 5.202893466490242733570232680736966655434E1L,
+ -1.931311847665757913322495948705563937159E1L,
+ -9.364318268748287664267341457164918090611E-2L,
+ -3.306390351286352764891355375882586201069E0L,
+ 7.573806045289044647727613003096916516475E-1L,
+ -9.611744011489092894027478899545635991213E-2L
+ ];
+ immutable real[9] RDr20 = [
+ 3.032829629520142564106649167182428189014E3L,
+ 1.659648470721967719961167083684972196891E3L,
+ 1.703545128657284619402511356932569292535E3L,
+ 6.393465677731598872500200253155257708763E2L,
+ 3.489131397281030947405287112726059221934E2L,
+ 8.848641738570783406484348434387611713070E1L,
+ 3.132269062552392974833215844236160958502E1L,
+ 4.430131663290563523933419966185230513168E0L,
+ 1.0
+ ];
+
+ // erfc(1.125) = C20a + C20b to extra precision.
+ immutable real C20a = 0.111602783203125L;
+ immutable real C20b = 8.9850951672359304215530728365232161564636E-6L;
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 7/8 <= 1/x < 1
+ // Peak relative error 1.4e-35
+ immutable real[10] RNr8 = [
+ 3.587451489255356250759834295199296936784E1L,
+ 5.406249749087340431871378009874875889602E2L,
+ 2.931301290625250886238822286506381194157E3L,
+ 7.359254185241795584113047248898753470923E3L,
+ 9.201031849810636104112101947312492532314E3L,
+ 5.749697096193191467751650366613289284777E3L,
+ 1.710415234419860825710780802678697889231E3L,
+ 2.150753982543378580859546706243022719599E2L,
+ 8.740953582272147335100537849981160931197E0L,
+ 4.876422978828717219629814794707963640913E-2L
+ ];
+ immutable real[10] RDr8 = [
+ 6.358593134096908350929496535931630140282E1L,
+ 9.900253816552450073757174323424051765523E2L,
+ 5.642928777856801020545245437089490805186E3L,
+ 1.524195375199570868195152698617273739609E4L,
+ 2.113829644500006749947332935305800887345E4L,
+ 1.526438562626465706267943737310282977138E4L,
+ 5.561370922149241457131421914140039411782E3L,
+ 9.394035530179705051609070428036834496942E2L,
+ 6.147019596150394577984175188032707343615E1L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 3/4 <= 1/x < 7/8
+ // Peak relative error 1.7e-36
+ immutable real[10] RNr7 = [
+ 1.293515364043117601705535485785956592493E2L,
+ 2.474534371269189867053251150063459055230E3L,
+ 1.756537563959875738809491329250457486510E4L,
+ 5.977479535376639763773153344676726091607E4L,
+ 1.054313816139671870123172936972055385049E5L,
+ 9.754699773487726957401038094714603033904E4L,
+ 4.579131242577671038339922925213209214880E4L,
+ 1.000710322164510887997115157797717324370E4L,
+ 8.496863250712471449526805271633794700452E2L,
+ 1.797349831386892396933210199236530557333E1L
+ ];
+ immutable real[11] RDr7 = [
+ 2.292696320307033494820399866075534515002E2L,
+ 4.500632608295626968062258401895610053116E3L,
+ 3.321218723485498111535866988511716659339E4L,
+ 1.196084512221845156596781258490840961462E5L,
+ 2.287033883912529843927983406878910939930E5L,
+ 2.370223495794642027268482075021298394425E5L,
+ 1.305173734022437154610938308944995159199E5L,
+ 3.589386258485887630236490009835928559621E4L,
+ 4.339996864041074149726360516336773136101E3L,
+ 1.753135522665469574605384979152863899099E2L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 5/8 <= 1/x < 3/4
+ // Peak relative error 1.6e-35
+ immutable real[10] RNr6 = [
+ 1.423313561367394923305025174137639124533E1L,
+ 3.244462503273003837514629113846075327206E2L,
+ 2.784937282403293364911673341412846781934E3L,
+ 1.163060685810874867196849890286455473382E4L,
+ 2.554141394931962276102205517358731053756E4L,
+ 2.982733782500729530503336931258698708782E4L,
+ 1.789683564523810605328169719436374742840E4L,
+ 5.056032142227470121262177112822018882754E3L,
+ 5.605349942234782054561269306895707034586E2L,
+ 1.561652599080729507274832243665726064881E1L
+ ];
+ immutable real[11] RDr6 = [
+ 2.522757606611479946069351519410222913326E1L,
+ 5.876797910931896554014229647006604017806E2L,
+ 5.211092128250480712011248211246144751074E3L,
+ 2.282679910855404599271496827409168580797E4L,
+ 5.371245819205596609986320599133109262447E4L,
+ 6.926186102106400355114925675028888924445E4L,
+ 4.794366033363621432575096487724913414473E4L,
+ 1.673190682734065914573814938835674963896E4L,
+ 2.589544846151313120096957014256536236242E3L,
+ 1.349438432583208276883323156200117027433E2L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 1/2 <= 1/x < 5/8
+ // Peak relative error 4.3e-36
+ immutable real[11] RNr5 = [
+ 6.743447478279267339131211137241149796763E-2L,
+ 2.031339015932962998168472743355874796350E0L,
+ 2.369234815713196480221800940201367484379E1L,
+ 1.387819614016107433603101545594790875922E2L,
+ 4.435600256936515839937720907171966121786E2L,
+ 7.881577949936817507981170892417739733047E2L,
+ 7.615749099291545976179905281851765734680E2L,
+ 3.752484528663442467089606663006771157777E2L,
+ 8.279644286027145214308303292537009564726E1L,
+ 6.201462983413738162709722770960040042647E0L,
+ 6.649631608720062333043506249503378282697E-2L
+ ];
+ immutable real[11] RDr5 = [
+ 1.195244945161889822018178270706903972343E-1L,
+ 3.660216908153253021384862427197665991311E0L,
+ 4.373405883243078019655721779021995159854E1L,
+ 2.653305963056235008916733402786877121865E2L,
+ 8.921329790491152046318422124415895506335E2L,
+ 1.705552231555600759729260640562363304312E3L,
+ 1.832711109606893446763174603477244625325E3L,
+ 1.056823953275835649973998168744261083316E3L,
+ 2.975561981792909722126456997074344895584E2L,
+ 3.393149095158232521894537008472203487436E1L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 3/8 <= 1/x < 1/2
+ // Peak relative error 1.8e-36
+ immutable real[11] RNr4 = [
+ 3.558685919236420073872459554885612994007E-2L,
+ 1.460223642496950651561817195253277924528E0L,
+ 2.379856746555189546876720308841066577268E1L,
+ 2.005205521246554860334064698817220160117E2L,
+ 9.533374928664989955629120027419490517596E2L,
+ 2.623024576994438336130421711314560425373E3L,
+ 4.126446434603735586340585027628851620886E3L,
+ 3.540675861596687801829655387867654030013E3L,
+ 1.506037084891064572653273761987617394259E3L,
+ 2.630715699182706745867272452228891752353E2L,
+ 1.202476629652900619635409242749750364878E1L
+ ];
+ immutable real[12] RDr4 = [
+ 6.307606561714590590399683184410336583739E-2L,
+ 2.619717051134271249293056836082721776665E0L,
+ 4.344441402681725017630451522968410844608E1L,
+ 3.752891116408399440953195184301023399176E2L,
+ 1.849305988804654653921972804388006355502E3L,
+ 5.358505261991675891835885654499883449403E3L,
+ 9.091890995405251314631428721090705475825E3L,
+ 8.731418313949291797856351745278287516416E3L,
+ 4.420211285043270337492325400764271868740E3L,
+ 1.031487363021856106882306509107923200832E3L,
+ 8.387036084846046121805145056040429461783E1L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 1/4 <= 1/x < 3/8
+ // Peak relative error 8.1e-37
+ immutable real[12] RNr3 = [
+ 4.584481542956275354582319313040418316755E-5L,
+ 2.674923158288848442110883948437930349128E-3L,
+ 6.344232532055212248017211243740466847311E-2L,
+ 7.985145965992002744933550450451513513963E-1L,
+ 5.845078061888281665064746347663123946270E0L,
+ 2.566625318816866587482759497608029522596E1L,
+ 6.736225182343446605268837827950856640948E1L,
+ 1.021796460139598089409347761712730512053E2L,
+ 8.344336615515430530929955615400706619764E1L,
+ 3.207749011528249356283897356277376306967E1L,
+ 4.386185123863412086856423971695142026036E0L,
+ 8.971576448581208351826868348023528863856E-2L
+ ];
+ immutable real[12] RDr3 = [
+ 8.125781965218112303281657065320409661370E-5L,
+ 4.781806762611504685247817818428945295520E-3L,
+ 1.147785538413798317790357996845767614561E-1L,
+ 1.469285552007088106614218996464752307606E0L,
+ 1.101712261349880339221039938999124077650E1L,
+ 5.008507527095093413713171655268276861071E1L,
+ 1.383058691613468970486425146336829447433E2L,
+ 2.264114250278912520501010108736339599752E2L,
+ 2.081377197698598680576330179979996940039E2L,
+ 9.724438129690013609440151781601781137944E1L,
+ 1.907905050771832372089975877589291760121E1L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 1/8 <= 1/x < 1/4
+ // Peak relative error 1.5e-36
+ immutable real[12] RNr2 = [
+ 6.928615158005256885698045840589513728399E-7L,
+ 5.616245938942075826026382337922413007879E-5L,
+ 1.871624980715261794832358438894219696113E-3L,
+ 3.349922063795792371642023765253747563009E-2L,
+ 3.531865233974349943956345502463135695834E-1L,
+ 2.264714157625072773976468825160906342360E0L,
+ 8.810720294489253776747319730638214883026E0L,
+ 2.014056685571655833019183248931442888437E1L,
+ 2.524586947657190747039554310814128743320E1L,
+ 1.520656940937208886246188940244581671609E1L,
+ 3.334145500790963675035841482334493680498E0L,
+ 1.122108380007109245896534245151140632457E-1L
+ ];
+ immutable real[12] RDr2 = [
+ 1.228065061824874795984937092427781089256E-6L,
+ 1.001593999520159167559129042893802235969E-4L,
+ 3.366527555699367241421450749821030974446E-3L,
+ 6.098626947195865254152265585991861150369E-2L,
+ 6.541547922508613985813189387198804660235E-1L,
+ 4.301130233305371976727117480925676583204E0L,
+ 1.737155892350891711527711121692994762909E1L,
+ 4.206892112110558214680649401236873828801E1L,
+ 5.787487996025016843403524261574779631219E1L,
+ 4.094047148590822715163181507813774861621E1L,
+ 1.230603930056944875836549716515643997094E1L,
+ 1.0L
+ ];
+
+ // erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ // 1/128 <= 1/x < 1/8
+ // Peak relative error 2.2e-36
+ immutable real[10] RNr1 = [
+ 1.293111801138199795250035229383033322539E-6L,
+ 9.785224720880980456171438759161161816706E-5L,
+ 2.932474396233212166056331430621176065943E-3L,
+ 4.496429595719847083917435337780697436921E-2L,
+ 3.805989855927720844877478869846718877846E-1L,
+ 1.789745532222426292126781724570152590071E0L,
+ 4.465737379634389318903237306594171764628E0L,
+ 5.268535822258082278401240171488850433767E0L,
+ 2.258357766807433839494276681092713991651E0L,
+ 1.504459334078750002966538036652860809497E-1L
+ ];
+ immutable real[10] RDr1 = [
+ 2.291980991578770070179177302906728371406E-6L,
+ 1.745845828808028552029674694534934620384E-4L,
+ 5.283248841982102317072923869576785278019E-3L,
+ 8.221212297078141470944454807434634848018E-2L,
+ 7.120500674861902950423510939060230945621E-1L,
+ 3.475435367560809622183983439133664598155E0L,
+ 9.243253391989233533874386043611304387113E0L,
+ 1.227894792475280941511758877318903197188E1L,
+ 6.789361410398841316638617624392719077724E0L,
+ 1.0L
+ ];
+
+ // erf(z+1) = erfConst + P(z)/Q(z)
+ // -.125 <= z <= 0
+ // Peak relative error 7.3e-36
+ immutable real erfConst = 0.845062911510467529296875L;
+ immutable real[9] TN2 = [
+ -4.088889697077485301010486931817357000235E1L,
+ 7.157046430681808553842307502826960051036E3L,
+ -2.191561912574409865550015485451373731780E3L,
+ 2.180174916555316874988981177654057337219E3L,
+ 2.848578658049670668231333682379720943455E2L,
+ 1.630362490952512836762810462174798925274E2L,
+ 6.317712353961866974143739396865293596895E0L,
+ 2.450441034183492434655586496522857578066E1L,
+ 5.127662277706787664956025545897050896203E-1L
+ ];
+ immutable real[10] TD2 = [
+ 1.731026445926834008273768924015161048885E4L,
+ 1.209682239007990370796112604286048173750E4L,
+ 1.160950290217993641320602282462976163857E4L,
+ 5.394294645127126577825507169061355698157E3L,
+ 2.791239340533632669442158497532521776093E3L,
+ 8.989365571337319032943005387378993827684E2L,
+ 2.974016493766349409725385710897298069677E2L,
+ 6.148192754590376378740261072533527271947E1L,
+ 1.178502892490738445655468927408440847480E1L,
+ 1.0L
+ ];
+
+ // erf(x) = x + x P(x^2)/Q(x^2)
+ // 0 <= x <= 7/8
+ // Peak relative error 1.8e-35
+ immutable real[9] TN1 = [
+ -3.858252324254637124543172907442106422373E10L,
+ 9.580319248590464682316366876952214879858E10L,
+ 1.302170519734879977595901236693040544854E10L,
+ 2.922956950426397417800321486727032845006E9L,
+ 1.764317520783319397868923218385468729799E8L,
+ 1.573436014601118630105796794840834145120E7L,
+ 4.028077380105721388745632295157816229289E5L,
+ 1.644056806467289066852135096352853491530E4L,
+ 3.390868480059991640235675479463287886081E1L
+ ];
+ immutable real[10] TD1 = [
+ -3.005357030696532927149885530689529032152E11L,
+ -1.342602283126282827411658673839982164042E11L,
+ -2.777153893355340961288511024443668743399E10L,
+ -3.483826391033531996955620074072768276974E9L,
+ -2.906321047071299585682722511260895227921E8L,
+ -1.653347985722154162439387878512427542691E7L,
+ -6.245520581562848778466500301865173123136E5L,
+ -1.402124304177498828590239373389110545142E4L,
+ -1.209368072473510674493129989468348633579E2L,
+ 1.0L
+ ];
+}
+else
+{
+ /* erfc(x) = exp(-x^2) 1/x R(1/x^2) / S(1/x^2)
+ 1/128 <= 1/x < 1/8
+ Peak relative error 1.9e-21 */
+ immutable real[5] R = [ 0x1.b9f6d8b78e22459ep-6, 0x1.1b84686b0a4ea43ap-1,
+ 0x1.b8f6aebe96000c2ap+1, 0x1.cb1dbedac27c8ec2p+2, 0x1.cf885f8f572a4c14p+1
+ ];
+
+ immutable real[6] S = [
+ 0x1.87ae3cae5f65eb5ep-5, 0x1.01616f266f306d08p+0, 0x1.a4abe0411eed6c22p+2,
+ 0x1.eac9ce3da600abaap+3, 0x1.5752a9ac2faebbccp+3, 1.0
+ ];
+
+ /* erf(x) = x P(x^2)/Q(x^2)
+ 0 <= x <= 1
+ Peak relative error 7.6e-23 */
+ immutable real[7] T = [ 0x1.0da01654d757888cp+20, 0x1.2eb7497bc8b4f4acp+17,
+ 0x1.79078c19530f72a8p+15, 0x1.4eaf2126c0b2c23p+11, 0x1.1f2ea81c9d272a2ep+8,
+ 0x1.59ca6e2d866e625p+2, 0x1.c188e0b67435faf4p-4
+ ];
+
+ immutable real[7] U = [ 0x1.dde6025c395ae34ep+19, 0x1.c4bc8b6235df35aap+18,
+ 0x1.8465900e88b6903ap+16, 0x1.855877093959ffdp+13, 0x1.e5c44395625ee358p+9,
+ 0x1.6a0fed103f1c68a6p+5, 1.0
+ ];
+}
+}
+
+/**
+ * Complementary error function
+ *
+ * erfc(x) = 1 - erf(x), and has high relative accuracy for
+ * values of x far from zero. (For values near zero, use erf(x)).
+ *
+ * 1 - erf(x) = 2/ $(SQRT)(&pi;)
+ * $(INTEGRAL x, $(INFINITY)) exp( - $(POWER t, 2)) dt
+ *
+ *
+ * For small x, erfc(x) = 1 - erf(x); otherwise rational
+ * approximations are computed.
+ *
+ * A special function expx2(x) is used to suppress error amplification
+ * in computing exp(-x^2).
+ */
+real erfc(real a)
+{
+ if (a == real.infinity)
+ return 0.0;
+ if (a == -real.infinity)
+ return 2.0;
+
+ immutable x = (a < 0.0) ? -a : a;
+
+ if (x < (isIEEEQuadruple ? 0.25 : 1.0))
+ return 1.0 - erf(a);
+
+ static if (isIEEEQuadruple)
+ {
+ if (x < 1.25)
+ {
+ real y;
+ final switch (cast(int)(8.0 * x))
+ {
+ case 2:
+ const z = x - 0.25;
+ y = C13b + z * rationalPoly(z, RNr13, RDr13);
+ y += C13a;
+ break;
+ case 3:
+ const z = x - 0.375;
+ y = C14b + z * rationalPoly(z, RNr14, RDr14);
+ y += C14a;
+ break;
+ case 4:
+ const z = x - 0.5;
+ y = C15b + z * rationalPoly(z, RNr15, RDr15);
+ y += C15a;
+ break;
+ case 5:
+ const z = x - 0.625;
+ y = C16b + z * rationalPoly(z, RNr16, RDr16);
+ y += C16a;
+ break;
+ case 6:
+ const z = x - 0.75;
+ y = C17b + z * rationalPoly(z, RNr17, RDr17);
+ y += C17a;
+ break;
+ case 7:
+ const z = x - 0.875;
+ y = C18b + z * rationalPoly(z, RNr18, RDr18);
+ y += C18a;
+ break;
+ case 8:
+ const z = x - 1.0;
+ y = C19b + z * rationalPoly(z, RNr19, RDr19);
+ y += C19a;
+ break;
+ case 9:
+ const z = x - 1.125;
+ y = C20b + z * rationalPoly(z, RNr20, RDr20);
+ y += C20a;
+ break;
+ }
+ if (a < 0.0)
+ y = 2.0 - y;
+ return y;
+ }
+ }
+
+ if (-a * a < -MAXLOG)
+ {
+ // underflow
+ if (a < 0.0) return 2.0;
+ else return 0.0;
+ }
+
+ real y;
+ immutable z = expx2(a, -1);
+
+ static if (isIEEEQuadruple)
+ {
+ y = z * erfce(x);
+ }
+ else
+ {
+ y = 1.0 / x;
+ if (x < 8.0)
+ y = z * rationalPoly(y, P, Q);
+ else
+ y = z * y * rationalPoly(y * y, R, S);
+ }
+
+ if (a < 0.0)
+ y = 2.0 - y;
+
+ if (y == 0.0)
+ {
+ // underflow
+ if (a < 0.0) return 2.0;
+ else return 0.0;
+ }
+
+ return y;
+}
+
+
+private {
+/* Exponentially scaled erfc function
+ exp(x^2) erfc(x)
+ valid for x > 1.
+ Use with normalDistribution and expx2. */
+static if (isIEEEQuadruple)
+{
+ real erfce(real x)
+ {
+ immutable z = 1.0L / (x * x);
+
+ real p;
+ switch (cast(int)(8.0 / x))
+ {
+ default:
+ case 0:
+ p = rationalPoly(z, RNr1, RDr1);
+ break;
+ case 1:
+ p = rationalPoly(z, RNr2, RDr2);
+ break;
+ case 2:
+ p = rationalPoly(z, RNr3, RDr3);
+ break;
+ case 3:
+ p = rationalPoly(z, RNr4, RDr4);
+ break;
+ case 4:
+ p = rationalPoly(z, RNr5, RDr5);
+ break;
+ case 5:
+ p = rationalPoly(z, RNr6, RDr6);
+ break;
+ case 6:
+ p = rationalPoly(z, RNr7, RDr7);
+ break;
+ case 7:
+ p = rationalPoly(z, RNr8, RDr8);
+ break;
+ }
+ return p / x;
+ }
+}
+else
+{
+ real erfce(real x)
+ {
+ real y = 1.0/x;
+
+ if (x < 8.0)
+ {
+ return rationalPoly(y, P, Q);
+ }
+ else
+ {
+ return y * rationalPoly(y * y, R, S);
+ }
+ }
+}
+}
+
+/**
+ * Error function
+ *
+ * The integral is
+ *
+ * erf(x) = 2/ $(SQRT)(&pi;)
+ * $(INTEGRAL 0, x) exp( - $(POWER t, 2)) dt
+ *
+ * The magnitude of x is limited to about 106.56 for IEEE 80-bit
+ * arithmetic; 1 or -1 is returned outside this range.
+ *
+ * For 0 <= |x| < 1, a rational polynomials are used; otherwise
+ * erf(x) = 1 - erfc(x).
+ *
+ * ACCURACY:
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,1 50000 2.0e-19 5.7e-20
+ */
+real erf(real x)
+{
+ if (x == 0.0)
+ return x; // deal with negative zero
+ if (x == -real.infinity)
+ return -1.0;
+ if (x == real.infinity)
+ return 1.0;
+ immutable ax = abs(x);
+ if (ax > 1.0L)
+ return 1.0L - erfc(x);
+
+ static if (isIEEEQuadruple)
+ {
+ immutable z = x * x;
+
+ real y;
+ if (ax < 0.875)
+ {
+ y = ax + ax * rationalPoly(x * x, TN1, TD1);
+ }
+ else
+ {
+ y = erfConst + rationalPoly(ax - 1.0L, TN2, TD2);
+ }
+
+ if (x < 0)
+ y = -y;
+ return y;
+ }
+ else
+ {
+ real z = x * x;
+ return x * rationalPoly(x * x, T, U);
+ }
+}
+
+@safe unittest
+{
+ // High resolution test points.
+ enum real erfc0_250 = 0.723663330078125 + 1.0279753638067014931732235184287934646022E-5;
+ enum real erfc0_375 = 0.5958709716796875 + 1.2118885490201676174914080878232469565953E-5;
+ enum real erfc0_500 = 0.4794921875 + 7.9346869534623172533461080354712635484242E-6;
+ enum real erfc0_625 = 0.3767547607421875 + 4.3570693945275513594941232097252997287766E-6;
+ enum real erfc0_750 = 0.2888336181640625 + 1.0748182422368401062165408589222625794046E-5;
+ enum real erfc0_875 = 0.215911865234375 + 1.3073705765341685464282101150637224028267E-5;
+ enum real erfc1_000 = 0.15728759765625 + 1.1609394035130658779364917390740703933002E-5;
+ enum real erfc1_125 = 0.111602783203125 + 8.9850951672359304215530728365232161564636E-6;
+
+ enum real erf0_875 = (1-0.215911865234375) - 1.3073705765341685464282101150637224028267E-5;
+
+ static bool isNaNWithPayload(real x, ulong payload) @safe pure nothrow @nogc
+ {
+ return isNaN(x) && getNaNPayload(x) == payload;
+ }
+
+ assert(feqrel(erfc(0.250L), erfc0_250 )>=real.mant_dig-1);
+ assert(feqrel(erfc(0.375L), erfc0_375 )>=real.mant_dig-0);
+ assert(feqrel(erfc(0.500L), erfc0_500 )>=real.mant_dig-2);
+ assert(feqrel(erfc(0.625L), erfc0_625 )>=real.mant_dig-1);
+ assert(feqrel(erfc(0.750L), erfc0_750 )>=real.mant_dig-1);
+ assert(feqrel(erfc(0.875L), erfc0_875 )>=real.mant_dig-4);
+ assert(feqrel(erfc(1.000L), erfc1_000 )>=real.mant_dig-2);
+ assert(feqrel(erfc(1.125L), erfc1_125 )>=real.mant_dig-2);
+ assert(feqrel(erf(0.875L), erf0_875 )>=real.mant_dig-1);
+ // The DMC implementation of erfc() fails this next test (just).
+ // Test point from Mathematica 11.0.
+ assert(feqrel(erfc(4.1L), 6.70002765408489837272673380763418472e-9L) >= real.mant_dig-5);
+
+ assert(isIdentical(erf(0.0),0.0));
+ assert(isIdentical(erf(-0.0),-0.0));
+ assert(erf(real.infinity) == 1.0);
+ assert(erf(-real.infinity) == -1.0);
+ assert(isNaNWithPayload(erf(NaN(0xDEF)), 0xDEF));
+ assert(isNaNWithPayload(erfc(NaN(0xDEF)), 0xDEF));
+ assert(isIdentical(erfc(real.infinity),0.0));
+ assert(erfc(-real.infinity) == 2.0);
+ assert(erfc(0) == 1.0);
+}
+
+/*
+ * Exponential of squared argument
+ *
+ * Computes y = exp(x*x) while suppressing error amplification
+ * that would ordinarily arise from the inexactness of the
+ * exponential argument x*x.
+ *
+ * If sign < 0, the result is inverted; i.e., y = exp(-x*x) .
+ *
+ * ACCURACY:
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE -106.566, 106.566 10^5 1.6e-19 4.4e-20
+ */
+
+real expx2(real x, int sign)
+{
+ /*
+ Cephes Math Library Release 2.9: June, 2000
+ Copyright 2000 by Stephen L. Moshier
+ */
+ const real M = 32_768.0;
+ const real MINV = 3.0517578125e-5L;
+
+ x = abs(x);
+ if (sign < 0)
+ x = -x;
+
+ /* Represent x as an exact multiple of M plus a residual.
+ M is a power of 2 chosen so that exp(m * m) does not overflow
+ or underflow and so that |x - m| is small. */
+ real m = MINV * floor(M * x + 0.5L);
+ real f = x - m;
+
+ /* x^2 = m^2 + 2mf + f^2 */
+ real u = m * m;
+ real u1 = 2 * m * f + f * f;
+
+ if (sign < 0)
+ {
+ u = -u;
+ u1 = -u1;
+ }
+
+ if ((u+u1) > MAXLOG)
+ return real.infinity;
+
+ /* u is exact, u1 is small. */
+ return exp(u) * exp(u1);
+}
+
+
+/*
+Computes the normal distribution function.
+
+The normal (or Gaussian, or bell-shaped) distribution is
+defined as:
+
+normalDist(x) = 1/$(SQRT) &pi; $(INTEGRAL -$(INFINITY), x) exp( - $(POWER t, 2)/2) dt
+ = 0.5 + 0.5 * erf(x/sqrt(2))
+ = 0.5 * erfc(- x/sqrt(2))
+
+To maintain accuracy at high values of x, use
+normalDistribution(x) = 1 - normalDistribution(-x).
+
+Accuracy:
+Within a few bits of machine resolution over the entire
+range.
+
+References:
+$(LINK http://www.netlib.org/cephes/ldoubdoc.html),
+G. Marsaglia, "Evaluating the Normal Distribution",
+Journal of Statistical Software <b>11</b>, (July 2004).
+*/
+real normalDistributionImpl(real a)
+{
+ real x = a * SQRT1_2;
+ real z = abs(x);
+
+ if ( z < 1.0 )
+ return 0.5L + 0.5L * erf(x);
+ else
+ {
+ real y = 0.5L * erfce(z);
+ /* Multiply by exp(-x^2 / 2) */
+ z = expx2(a, -1);
+ y = y * sqrt(z);
+ if ( x > 0.0L )
+ y = 1.0L - y;
+ return y;
+ }
+}
+
+@safe unittest
+{
+assert(fabs(normalDistributionImpl(1L) - (0.841344746068543))< 0.0000000000000005);
+assert(isIdentical(normalDistributionImpl(NaN(0x325)), NaN(0x325)));
+}
+
+/*
+ * Inverse of Normal distribution function
+ *
+ * Returns the argument, x, for which the area under the
+ * Normal probability density function (integrated from
+ * minus infinity to x) is equal to p.
+ *
+ * For small arguments 0 < p < exp(-2), the program computes
+ * z = sqrt( -2 log(p) ); then the approximation is
+ * x = z - log(z)/z - (1/z) P(1/z) / Q(1/z) .
+ * For larger arguments, x/sqrt(2 pi) = w + w^3 R(w^2)/S(w^2)) ,
+ * where w = p - 0.5.
+ */
+// TODO: isIEEEQuadruple (128 bit) real implementation; not available from CEPHES.
+real normalDistributionInvImpl(real p)
+in {
+ assert(p >= 0.0L && p <= 1.0L, "Domain error");
+}
+body
+{
+static immutable real[8] P0 =
+[ -0x1.758f4d969484bfdcp-7, 0x1.53cee17a59259dd2p-3,
+ -0x1.ea01e4400a9427a2p-1, 0x1.61f7504a0105341ap+1, -0x1.09475a594d0399f6p+2,
+ 0x1.7c59e7a0df99e3e2p+1, -0x1.87a81da52edcdf14p-1, 0x1.1fb149fd3f83600cp-7
+];
+
+static immutable real[8] Q0 =
+[ -0x1.64b92ae791e64bb2p-7, 0x1.7585c7d597298286p-3,
+ -0x1.40011be4f7591ce6p+0, 0x1.1fc067d8430a425ep+2, -0x1.21008ffb1e7ccdf2p+3,
+ 0x1.3d1581cf9bc12fccp+3, -0x1.53723a89fd8f083cp+2, 1.0
+];
+
+static immutable real[10] P1 =
+[ 0x1.20ceea49ea142f12p-13, 0x1.cbe8a7267aea80bp-7,
+ 0x1.79fea765aa787c48p-2, 0x1.d1f59faa1f4c4864p+1, 0x1.1c22e426a013bb96p+4,
+ 0x1.a8675a0c51ef3202p+5, 0x1.75782c4f83614164p+6, 0x1.7a2f3d90948f1666p+6,
+ 0x1.5cd116ee4c088c3ap+5, 0x1.1361e3eb6e3cc20ap+2
+];
+
+static immutable real[10] Q1 =
+[ 0x1.3a4ce1406cea98fap-13, 0x1.f45332623335cda2p-7,
+ 0x1.98f28bbd4b98db1p-2, 0x1.ec3b24f9c698091cp+1, 0x1.1cc56ecda7cf58e4p+4,
+ 0x1.92c6f7376bf8c058p+5, 0x1.4154c25aa47519b4p+6, 0x1.1b321d3b927849eap+6,
+ 0x1.403a5f5a4ce7b202p+4, 1.0
+];
+
+static immutable real[8] P2 =
+[ 0x1.8c124a850116a6d8p-21, 0x1.534abda3c2fb90bap-13,
+ 0x1.29a055ec93a4718cp-7, 0x1.6468e98aad6dd474p-3, 0x1.3dab2ef4c67a601cp+0,
+ 0x1.e1fb3a1e70c67464p+1, 0x1.b6cce8035ff57b02p+2, 0x1.9f4c9e749ff35f62p+1
+];
+
+static immutable real[8] Q2 =
+[ 0x1.af03f4fc0655e006p-21, 0x1.713192048d11fb2p-13,
+ 0x1.4357e5bbf5fef536p-7, 0x1.7fdac8749985d43cp-3, 0x1.4a080c813a2d8e84p+0,
+ 0x1.c3a4b423cdb41bdap+1, 0x1.8160694e24b5557ap+2, 1.0
+];
+
+static immutable real[8] P3 =
+[ -0x1.55da447ae3806168p-34, -0x1.145635641f8778a6p-24,
+ -0x1.abf46d6b48040128p-17, -0x1.7da550945da790fcp-11, -0x1.aa0b2a31157775fap-8,
+ 0x1.b11d97522eed26bcp-3, 0x1.1106d22f9ae89238p+1, 0x1.029a358e1e630f64p+1
+];
+
+static immutable real[8] Q3 =
+[ -0x1.74022dd5523e6f84p-34, -0x1.2cb60d61e29ee836p-24,
+ -0x1.d19e6ec03a85e556p-17, -0x1.9ea2a7b4422f6502p-11, -0x1.c54b1e852f107162p-8,
+ 0x1.e05268dd3c07989ep-3, 0x1.239c6aff14afbf82p+1, 1.0
+];
+
+ if (p <= 0.0L || p >= 1.0L)
+ {
+ if (p == 0.0L)
+ return -real.infinity;
+ if ( p == 1.0L )
+ return real.infinity;
+ return real.nan; // domain error
+ }
+ int code = 1;
+ real y = p;
+ if ( y > (1.0L - EXP_2) )
+ {
+ y = 1.0L - y;
+ code = 0;
+ }
+
+ real x, z, y2, x0, x1;
+
+ if ( y > EXP_2 )
+ {
+ y = y - 0.5L;
+ y2 = y * y;
+ x = y + y * (y2 * rationalPoly( y2, P0, Q0));
+ return x * SQRT2PI;
+ }
+
+ x = sqrt( -2.0L * log(y) );
+ x0 = x - log(x)/x;
+ z = 1.0L/x;
+ if ( x < 8.0L )
+ {
+ x1 = z * rationalPoly( z, P1, Q1);
+ }
+ else if ( x < 32.0L )
+ {
+ x1 = z * rationalPoly( z, P2, Q2);
+ }
+ else
+ {
+ x1 = z * rationalPoly( z, P3, Q3);
+ }
+ x = x0 - x1;
+ if ( code != 0 )
+ {
+ x = -x;
+ }
+ return x;
+}
+
+
+@safe unittest
+{
+ // TODO: Use verified test points.
+ // The values below are from Excel 2003.
+ assert(fabs(normalDistributionInvImpl(0.001) - (-3.09023230616779))< 0.00000000000005);
+ assert(fabs(normalDistributionInvImpl(1e-50) - (-14.9333375347885))< 0.00000000000005);
+ assert(feqrel(normalDistributionInvImpl(0.999), -normalDistributionInvImpl(0.001)) > real.mant_dig-6);
+
+ // Excel 2003 gets all the following values wrong!
+ assert(normalDistributionInvImpl(0.0) == -real.infinity);
+ assert(normalDistributionInvImpl(1.0) == real.infinity);
+ assert(normalDistributionInvImpl(0.5) == 0);
+ // (Excel 2003 returns norminv(p) = -30 for all p < 1e-200).
+ // The value tested here is the one the function returned in Jan 2006.
+ real unknown1 = normalDistributionInvImpl(1e-250L);
+ assert( fabs(unknown1 -(-33.79958617269L) ) < 0.00000005);
+}
diff --git a/libphobos/src/std/internal/math/gammafunction.d b/libphobos/src/std/internal/math/gammafunction.d
new file mode 100644
index 0000000..dd20691
--- /dev/null
+++ b/libphobos/src/std/internal/math/gammafunction.d
@@ -0,0 +1,1834 @@
+/**
+ * Implementation of the gamma and beta functions, and their integrals.
+ *
+ * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
+ * Copyright: Based on the CEPHES math library, which is
+ * Copyright (C) 1994 Stephen L. Moshier (moshier@world.std.com).
+ * Authors: Stephen L. Moshier (original C code). Conversion to D by Don Clugston
+ *
+ *
+Macros:
+ * TABLE_SV = <table border="1" cellpadding="4" cellspacing="0">
+ * <caption>Special Values</caption>
+ * $0</table>
+ * SVH = $(TR $(TH $1) $(TH $2))
+ * SV = $(TR $(TD $1) $(TD $2))
+ * GAMMA = &#915;
+ * INTEGRATE = $(BIG &#8747;<sub>$(SMALL $1)</sub><sup>$2</sup>)
+ * POWER = $1<sup>$2</sup>
+ * NAN = $(RED NAN)
+ */
+module std.internal.math.gammafunction;
+import std.internal.math.errorfunction;
+import std.math;
+
+pure:
+nothrow:
+@safe:
+@nogc:
+
+private {
+
+enum real SQRT2PI = 2.50662827463100050242E0L; // sqrt(2pi)
+immutable real EULERGAMMA = 0.57721_56649_01532_86060_65120_90082_40243_10421_59335_93992L; /** Euler-Mascheroni constant 0.57721566.. */
+
+// Polynomial approximations for gamma and loggamma.
+
+immutable real[8] GammaNumeratorCoeffs = [ 1.0,
+ 0x1.acf42d903366539ep-1, 0x1.73a991c8475f1aeap-2, 0x1.c7e918751d6b2a92p-4,
+ 0x1.86d162cca32cfe86p-6, 0x1.0c378e2e6eaf7cd8p-8, 0x1.dc5c66b7d05feb54p-12,
+ 0x1.616457b47e448694p-15
+];
+
+immutable real[9] GammaDenominatorCoeffs = [ 1.0,
+ 0x1.a8f9faae5d8fc8bp-2, -0x1.cb7895a6756eebdep-3, -0x1.7b9bab006d30652ap-5,
+ 0x1.c671af78f312082ep-6, -0x1.a11ebbfaf96252dcp-11, -0x1.447b4d2230a77ddap-10,
+ 0x1.ec1d45bb85e06696p-13,-0x1.d4ce24d05bd0a8e6p-17
+];
+
+immutable real[9] GammaSmallCoeffs = [ 1.0,
+ 0x1.2788cfc6fb618f52p-1, -0x1.4fcf4026afa2f7ecp-1, -0x1.5815e8fa24d7e306p-5,
+ 0x1.5512320aea2ad71ap-3, -0x1.59af0fb9d82e216p-5, -0x1.3b4b61d3bfdf244ap-7,
+ 0x1.d9358e9d9d69fd34p-8, -0x1.38fc4bcbada775d6p-10
+];
+
+immutable real[9] GammaSmallNegCoeffs = [ -1.0,
+ 0x1.2788cfc6fb618f54p-1, 0x1.4fcf4026afa2bc4cp-1, -0x1.5815e8fa2468fec8p-5,
+ -0x1.5512320baedaf4b6p-3, -0x1.59af0fa283baf07ep-5, 0x1.3b4a70de31e05942p-7,
+ 0x1.d9398be3bad13136p-8, 0x1.291b73ee05bcbba2p-10
+];
+
+immutable real[7] logGammaStirlingCoeffs = [
+ 0x1.5555555555553f98p-4, -0x1.6c16c16c07509b1p-9, 0x1.a01a012461cbf1e4p-11,
+ -0x1.3813089d3f9d164p-11, 0x1.b911a92555a277b8p-11, -0x1.ed0a7b4206087b22p-10,
+ 0x1.402523859811b308p-8
+];
+
+immutable real[7] logGammaNumerator = [
+ -0x1.0edd25913aaa40a2p+23, -0x1.31c6ce2e58842d1ep+24, -0x1.f015814039477c3p+23,
+ -0x1.74ffe40c4b184b34p+22, -0x1.0d9c6d08f9eab55p+20, -0x1.54c6b71935f1fc88p+16,
+ -0x1.0e761b42932b2aaep+11
+];
+
+immutable real[8] logGammaDenominator = [
+ -0x1.4055572d75d08c56p+24, -0x1.deeb6013998e4d76p+24, -0x1.106f7cded5dcc79ep+24,
+ -0x1.25e17184848c66d2p+22, -0x1.301303b99a614a0ap+19, -0x1.09e76ab41ae965p+15,
+ -0x1.00f95ced9e5f54eep+9, 1.0
+];
+
+/*
+ * Helper function: Gamma function computed by Stirling's formula.
+ *
+ * Stirling's formula for the gamma function is:
+ *
+ * $(GAMMA)(x) = sqrt(2 &pi;) x<sup>x-0.5</sup> exp(-x) (1 + 1/x P(1/x))
+ *
+ */
+real gammaStirling(real x)
+{
+ // CEPHES code Copyright 1994 by Stephen L. Moshier
+
+ static immutable real[9] SmallStirlingCoeffs = [
+ 0x1.55555555555543aap-4, 0x1.c71c71c720dd8792p-9, -0x1.5f7268f0b5907438p-9,
+ -0x1.e13cd410e0477de6p-13, 0x1.9b0f31643442616ep-11, 0x1.2527623a3472ae08p-14,
+ -0x1.37f6bc8ef8b374dep-11,-0x1.8c968886052b872ap-16, 0x1.76baa9c6d3eeddbcp-11
+ ];
+
+ static immutable real[7] LargeStirlingCoeffs = [ 1.0L,
+ 8.33333333333333333333E-2L, 3.47222222222222222222E-3L,
+ -2.68132716049382716049E-3L, -2.29472093621399176955E-4L,
+ 7.84039221720066627474E-4L, 6.97281375836585777429E-5L
+ ];
+
+ real w = 1.0L/x;
+ real y = exp(x);
+ if ( x > 1024.0L )
+ {
+ // For large x, use rational coefficients from the analytical expansion.
+ w = poly(w, LargeStirlingCoeffs);
+ // Avoid overflow in pow()
+ real v = pow( x, 0.5L * x - 0.25L );
+ y = v * (v / y);
+ }
+ else
+ {
+ w = 1.0L + w * poly( w, SmallStirlingCoeffs);
+ static if (floatTraits!(real).realFormat == RealFormat.ieeeDouble)
+ {
+ // Avoid overflow in pow() for 64-bit reals
+ if (x > 143.0)
+ {
+ real v = pow( x, 0.5 * x - 0.25 );
+ y = v * (v / y);
+ }
+ else
+ {
+ y = pow( x, x - 0.5 ) / y;
+ }
+ }
+ else
+ {
+ y = pow( x, x - 0.5L ) / y;
+ }
+ }
+ y = SQRT2PI * y * w;
+ return y;
+}
+
+/*
+ * Helper function: Incomplete gamma function computed by Temme's expansion.
+ *
+ * This is a port of igamma_temme_large from Boost.
+ *
+ */
+real igammaTemmeLarge(real a, real x)
+{
+ static immutable real[][13] coef = [
+ [ -0.333333333333333333333, 0.0833333333333333333333,
+ -0.0148148148148148148148, 0.00115740740740740740741,
+ 0.000352733686067019400353, -0.0001787551440329218107,
+ 0.39192631785224377817e-4, -0.218544851067999216147e-5,
+ -0.18540622107151599607e-5, 0.829671134095308600502e-6,
+ -0.176659527368260793044e-6, 0.670785354340149858037e-8,
+ 0.102618097842403080426e-7, -0.438203601845335318655e-8,
+ 0.914769958223679023418e-9, -0.255141939949462497669e-10,
+ -0.583077213255042506746e-10, 0.243619480206674162437e-10,
+ -0.502766928011417558909e-11 ],
+ [ -0.00185185185185185185185, -0.00347222222222222222222,
+ 0.00264550264550264550265, -0.000990226337448559670782,
+ 0.000205761316872427983539, -0.40187757201646090535e-6,
+ -0.18098550334489977837e-4, 0.764916091608111008464e-5,
+ -0.161209008945634460038e-5, 0.464712780280743434226e-8,
+ 0.137863344691572095931e-6, -0.575254560351770496402e-7,
+ 0.119516285997781473243e-7, -0.175432417197476476238e-10,
+ -0.100915437106004126275e-8, 0.416279299184258263623e-9,
+ -0.856390702649298063807e-10 ],
+ [ 0.00413359788359788359788, -0.00268132716049382716049,
+ 0.000771604938271604938272, 0.200938786008230452675e-5,
+ -0.000107366532263651605215, 0.529234488291201254164e-4,
+ -0.127606351886187277134e-4, 0.342357873409613807419e-7,
+ 0.137219573090629332056e-5, -0.629899213838005502291e-6,
+ 0.142806142060642417916e-6, -0.204770984219908660149e-9,
+ -0.140925299108675210533e-7, 0.622897408492202203356e-8,
+ -0.136704883966171134993e-8 ],
+ [ 0.000649434156378600823045, 0.000229472093621399176955,
+ -0.000469189494395255712128, 0.000267720632062838852962,
+ -0.756180167188397641073e-4, -0.239650511386729665193e-6,
+ 0.110826541153473023615e-4, -0.56749528269915965675e-5,
+ 0.142309007324358839146e-5, -0.278610802915281422406e-10,
+ -0.169584040919302772899e-6, 0.809946490538808236335e-7,
+ -0.191111684859736540607e-7 ],
+ [ -0.000861888290916711698605, 0.000784039221720066627474,
+ -0.000299072480303190179733, -0.146384525788434181781e-5,
+ 0.664149821546512218666e-4, -0.396836504717943466443e-4,
+ 0.113757269706784190981e-4, 0.250749722623753280165e-9,
+ -0.169541495365583060147e-5, 0.890750753220530968883e-6,
+ -0.229293483400080487057e-6],
+ [ -0.000336798553366358150309, -0.697281375836585777429e-4,
+ 0.000277275324495939207873, -0.000199325705161888477003,
+ 0.679778047793720783882e-4, 0.141906292064396701483e-6,
+ -0.135940481897686932785e-4, 0.801847025633420153972e-5,
+ -0.229148117650809517038e-5 ],
+ [ 0.000531307936463992223166, -0.000592166437353693882865,
+ 0.000270878209671804482771, 0.790235323266032787212e-6,
+ -0.815396936756196875093e-4, 0.561168275310624965004e-4,
+ -0.183291165828433755673e-4, -0.307961345060330478256e-8,
+ 0.346515536880360908674e-5, -0.20291327396058603727e-5,
+ 0.57887928631490037089e-6 ],
+ [ 0.000344367606892377671254, 0.517179090826059219337e-4,
+ -0.000334931610811422363117, 0.000281269515476323702274,
+ -0.000109765822446847310235, -0.127410090954844853795e-6,
+ 0.277444515115636441571e-4, -0.182634888057113326614e-4,
+ 0.578769494973505239894e-5 ],
+ [ -0.000652623918595309418922, 0.000839498720672087279993,
+ -0.000438297098541721005061, -0.696909145842055197137e-6,
+ 0.000166448466420675478374, -0.000127835176797692185853,
+ 0.462995326369130429061e-4 ],
+ [ -0.000596761290192746250124, -0.720489541602001055909e-4,
+ 0.000678230883766732836162, -0.0006401475260262758451,
+ 0.000277501076343287044992 ],
+ [ 0.00133244544948006563713, -0.0019144384985654775265,
+ 0.00110893691345966373396 ],
+ [ 0.00157972766073083495909, 0.000162516262783915816899,
+ -0.00206334210355432762645, 0.00213896861856890981541,
+ -0.00101085593912630031708 ],
+ [ -0.00407251211951401664727, 0.00640336283380806979482,
+ -0.00404101610816766177474 ]
+ ];
+
+ // avoid nans when one of the arguments is inf:
+ if (x == real.infinity && a != real.infinity)
+ return 0;
+
+ if (x != real.infinity && a == real.infinity)
+ return 1;
+
+ real sigma = (x - a) / a;
+ real phi = sigma - log(sigma + 1);
+
+ real y = a * phi;
+ real z = sqrt(2 * phi);
+ if (x < a)
+ z = -z;
+
+ real[13] workspace;
+ foreach (i; 0 .. coef.length)
+ workspace[i] = poly(z, coef[i]);
+
+ real result = poly(1 / a, workspace);
+ result *= exp(-y) / sqrt(2 * PI * a);
+ if (x < a)
+ result = -result;
+
+ result += erfc(sqrt(y)) / 2;
+
+ return result;
+}
+
+} // private
+
+public:
+/// The maximum value of x for which gamma(x) < real.infinity.
+static if (floatTraits!(real).realFormat == RealFormat.ieeeQuadruple)
+ enum real MAXGAMMA = 1755.5483429L;
+else static if (floatTraits!(real).realFormat == RealFormat.ieeeExtended)
+ enum real MAXGAMMA = 1755.5483429L;
+else static if (floatTraits!(real).realFormat == RealFormat.ieeeDouble)
+ enum real MAXGAMMA = 171.6243769L;
+else
+ static assert(0, "missing MAXGAMMA for other real types");
+
+
+/*****************************************************
+ * The Gamma function, $(GAMMA)(x)
+ *
+ * $(GAMMA)(x) is a generalisation of the factorial function
+ * to real and complex numbers.
+ * Like x!, $(GAMMA)(x+1) = x*$(GAMMA)(x).
+ *
+ * Mathematically, if z.re > 0 then
+ * $(GAMMA)(z) = $(INTEGRATE 0, &infin;) $(POWER t, z-1)$(POWER e, -t) dt
+ *
+ * $(TABLE_SV
+ * $(SVH x, $(GAMMA)(x) )
+ * $(SV $(NAN), $(NAN) )
+ * $(SV &plusmn;0.0, &plusmn;&infin;)
+ * $(SV integer > 0, (x-1)! )
+ * $(SV integer < 0, $(NAN) )
+ * $(SV +&infin;, +&infin; )
+ * $(SV -&infin;, $(NAN) )
+ * )
+ */
+real gamma(real x)
+{
+/* Based on code from the CEPHES library.
+ * CEPHES code Copyright 1994 by Stephen L. Moshier
+ *
+ * Arguments |x| <= 13 are reduced by recurrence and the function
+ * approximated by a rational function of degree 7/8 in the
+ * interval (2,3). Large arguments are handled by Stirling's
+ * formula. Large negative arguments are made positive using
+ * a reflection formula.
+ */
+
+ real q, z;
+ if (isNaN(x)) return x;
+ if (x == -x.infinity) return real.nan;
+ if ( fabs(x) > MAXGAMMA ) return real.infinity;
+ if (x == 0) return 1.0 / x; // +- infinity depending on sign of x, create an exception.
+
+ q = fabs(x);
+
+ if ( q > 13.0L )
+ {
+ // Large arguments are handled by Stirling's
+ // formula. Large negative arguments are made positive using
+ // the reflection formula.
+
+ if ( x < 0.0L )
+ {
+ if (x < -1/real.epsilon)
+ {
+ // Large negatives lose all precision
+ return real.nan;
+ }
+ int sgngam = 1; // sign of gamma.
+ long intpart = cast(long)(q);
+ if (q == intpart)
+ return real.nan; // poles for all integers <0.
+ real p = intpart;
+ if ( (intpart & 1) == 0 )
+ sgngam = -1;
+ z = q - p;
+ if ( z > 0.5L )
+ {
+ p += 1.0L;
+ z = q - p;
+ }
+ z = q * sin( PI * z );
+ z = fabs(z) * gammaStirling(q);
+ if ( z <= PI/real.max ) return sgngam * real.infinity;
+ return sgngam * PI/z;
+ }
+ else
+ {
+ return gammaStirling(x);
+ }
+ }
+
+ // Arguments |x| <= 13 are reduced by recurrence and the function
+ // approximated by a rational function of degree 7/8 in the
+ // interval (2,3).
+
+ z = 1.0L;
+ while ( x >= 3.0L )
+ {
+ x -= 1.0L;
+ z *= x;
+ }
+
+ while ( x < -0.03125L )
+ {
+ z /= x;
+ x += 1.0L;
+ }
+
+ if ( x <= 0.03125L )
+ {
+ if ( x == 0.0L )
+ return real.nan;
+ else
+ {
+ if ( x < 0.0L )
+ {
+ x = -x;
+ return z / (x * poly( x, GammaSmallNegCoeffs ));
+ }
+ else
+ {
+ return z / (x * poly( x, GammaSmallCoeffs ));
+ }
+ }
+ }
+
+ while ( x < 2.0L )
+ {
+ z /= x;
+ x += 1.0L;
+ }
+ if ( x == 2.0L ) return z;
+
+ x -= 2.0L;
+ return z * poly( x, GammaNumeratorCoeffs ) / poly( x, GammaDenominatorCoeffs );
+}
+
+@safe unittest
+{
+ // gamma(n) = factorial(n-1) if n is an integer.
+ real fact = 1.0L;
+ for (int i=1; fact<real.max; ++i)
+ {
+ // Require exact equality for small factorials
+ if (i<14) assert(gamma(i*1.0L) == fact);
+ assert(feqrel(gamma(i*1.0L), fact) >= real.mant_dig-15);
+ fact *= (i*1.0L);
+ }
+ assert(gamma(0.0) == real.infinity);
+ assert(gamma(-0.0) == -real.infinity);
+ assert(isNaN(gamma(-1.0)));
+ assert(isNaN(gamma(-15.0)));
+ assert(isIdentical(gamma(NaN(0xABC)), NaN(0xABC)));
+ assert(gamma(real.infinity) == real.infinity);
+ assert(gamma(real.max) == real.infinity);
+ assert(isNaN(gamma(-real.infinity)));
+ assert(gamma(real.min_normal*real.epsilon) == real.infinity);
+ assert(gamma(MAXGAMMA)< real.infinity);
+ assert(gamma(MAXGAMMA*2) == real.infinity);
+
+ // Test some high-precision values (50 decimal digits)
+ real SQRT_PI = 1.77245385090551602729816748334114518279754945612238L;
+
+
+ assert(feqrel(gamma(0.5L), SQRT_PI) >= real.mant_dig-1);
+ assert(feqrel(gamma(17.25L), 4.224986665692703551570937158682064589938e13L) >= real.mant_dig-4);
+
+ assert(feqrel(gamma(1.0 / 3.0L), 2.67893853470774763365569294097467764412868937795730L) >= real.mant_dig-2);
+ assert(feqrel(gamma(0.25L),
+ 3.62560990822190831193068515586767200299516768288006L) >= real.mant_dig-1);
+ assert(feqrel(gamma(1.0 / 5.0L),
+ 4.59084371199880305320475827592915200343410999829340L) >= real.mant_dig-1);
+}
+
+/*****************************************************
+ * Natural logarithm of gamma function.
+ *
+ * Returns the base e (2.718...) logarithm of the absolute
+ * value of the gamma function of the argument.
+ *
+ * For reals, logGamma is equivalent to log(fabs(gamma(x))).
+ *
+ * $(TABLE_SV
+ * $(SVH x, logGamma(x) )
+ * $(SV $(NAN), $(NAN) )
+ * $(SV integer <= 0, +&infin; )
+ * $(SV &plusmn;&infin;, +&infin; )
+ * )
+ */
+real logGamma(real x)
+{
+ /* Based on code from the CEPHES library.
+ * CEPHES code Copyright 1994 by Stephen L. Moshier
+ *
+ * For arguments greater than 33, the logarithm of the gamma
+ * function is approximated by the logarithmic version of
+ * Stirling's formula using a polynomial approximation of
+ * degree 4. Arguments between -33 and +33 are reduced by
+ * recurrence to the interval [2,3] of a rational approximation.
+ * The cosecant reflection formula is employed for arguments
+ * less than -33.
+ */
+ real q, w, z, f, nx;
+
+ if (isNaN(x)) return x;
+ if (fabs(x) == x.infinity) return x.infinity;
+
+ if ( x < -34.0L )
+ {
+ q = -x;
+ w = logGamma(q);
+ real p = floor(q);
+ if ( p == q )
+ return real.infinity;
+ int intpart = cast(int)(p);
+ real sgngam = 1;
+ if ( (intpart & 1) == 0 )
+ sgngam = -1;
+ z = q - p;
+ if ( z > 0.5L )
+ {
+ p += 1.0L;
+ z = p - q;
+ }
+ z = q * sin( PI * z );
+ if ( z == 0.0L )
+ return sgngam * real.infinity;
+ /* z = LOGPI - logl( z ) - w; */
+ z = log( PI/z ) - w;
+ return z;
+ }
+
+ if ( x < 13.0L )
+ {
+ z = 1.0L;
+ nx = floor( x + 0.5L );
+ f = x - nx;
+ while ( x >= 3.0L )
+ {
+ nx -= 1.0L;
+ x = nx + f;
+ z *= x;
+ }
+ while ( x < 2.0L )
+ {
+ if ( fabs(x) <= 0.03125 )
+ {
+ if ( x == 0.0L )
+ return real.infinity;
+ if ( x < 0.0L )
+ {
+ x = -x;
+ q = z / (x * poly( x, GammaSmallNegCoeffs));
+ } else
+ q = z / (x * poly( x, GammaSmallCoeffs));
+ return log( fabs(q) );
+ }
+ z /= nx + f;
+ nx += 1.0L;
+ x = nx + f;
+ }
+ z = fabs(z);
+ if ( x == 2.0L )
+ return log(z);
+ x = (nx - 2.0L) + f;
+ real p = x * rationalPoly( x, logGammaNumerator, logGammaDenominator);
+ return log(z) + p;
+ }
+
+ // const real MAXLGM = 1.04848146839019521116e+4928L;
+ // if ( x > MAXLGM ) return sgngaml * real.infinity;
+
+ const real LOGSQRT2PI = 0.91893853320467274178L; // log( sqrt( 2*pi ) )
+
+ q = ( x - 0.5L ) * log(x) - x + LOGSQRT2PI;
+ if (x > 1.0e10L) return q;
+ real p = 1.0L / (x*x);
+ q += poly( p, logGammaStirlingCoeffs ) / x;
+ return q ;
+}
+
+@safe unittest
+{
+ assert(isIdentical(logGamma(NaN(0xDEF)), NaN(0xDEF)));
+ assert(logGamma(real.infinity) == real.infinity);
+ assert(logGamma(-1.0) == real.infinity);
+ assert(logGamma(0.0) == real.infinity);
+ assert(logGamma(-50.0) == real.infinity);
+ assert(isIdentical(0.0L, logGamma(1.0L)));
+ assert(isIdentical(0.0L, logGamma(2.0L)));
+ assert(logGamma(real.min_normal*real.epsilon) == real.infinity);
+ assert(logGamma(-real.min_normal*real.epsilon) == real.infinity);
+
+ // x, correct loggamma(x), correct d/dx loggamma(x).
+ immutable static real[] testpoints = [
+ 8.0L, 8.525146484375L + 1.48766904143001655310E-5, 2.01564147795560999654E0L,
+ 8.99993896484375e-1L, 6.6375732421875e-2L + 5.11505711292524166220E-6L, -7.54938684259372234258E-1,
+ 7.31597900390625e-1L, 2.2369384765625e-1 + 5.21506341809849792422E-6L, -1.13355566660398608343E0L,
+ 2.31639862060546875e-1L, 1.3686676025390625L + 1.12609441752996145670E-5L, -4.56670961813812679012E0,
+ 1.73162841796875L, -8.88214111328125e-2L + 3.36207740803753034508E-6L, 2.33339034686200586920E-1L,
+ 1.23162841796875L, -9.3902587890625e-2L + 1.28765089229009648104E-5L, -2.49677345775751390414E-1L,
+ 7.3786976294838206464e19L, 3.301798506038663053312e21L - 1.656137564136932662487046269677E5L,
+ 4.57477139169563904215E1L,
+ 1.08420217248550443401E-19L, 4.36682586669921875e1L + 1.37082843669932230418E-5L,
+ -9.22337203685477580858E18L,
+ 1.0L, 0.0L, -5.77215664901532860607E-1L,
+ 2.0L, 0.0L, 4.22784335098467139393E-1L,
+ -0.5L, 1.2655029296875L + 9.19379714539648894580E-6L, 3.64899739785765205590E-2L,
+ -1.5L, 8.6004638671875e-1L + 6.28657731014510932682E-7L, 7.03156640645243187226E-1L,
+ -2.5L, -5.6243896484375E-2L + 1.79986700949327405470E-7, 1.10315664064524318723E0L,
+ -3.5L, -1.30902099609375L + 1.43111007079536392848E-5L, 1.38887092635952890151E0L
+ ];
+ // TODO: test derivatives as well.
+ for (int i=0; i<testpoints.length; i+=3)
+ {
+ assert( feqrel(logGamma(testpoints[i]), testpoints[i+1]) > real.mant_dig-5);
+ if (testpoints[i]<MAXGAMMA)
+ {
+ assert( feqrel(log(fabs(gamma(testpoints[i]))), testpoints[i+1]) > real.mant_dig-5);
+ }
+ }
+ assert(logGamma(-50.2) == log(fabs(gamma(-50.2))));
+ assert(logGamma(-0.008) == log(fabs(gamma(-0.008))));
+ assert(feqrel(logGamma(-38.8),log(fabs(gamma(-38.8)))) > real.mant_dig-4);
+ static if (real.mant_dig >= 64) // incl. 80-bit reals
+ assert(feqrel(logGamma(1500.0L),log(gamma(1500.0L))) > real.mant_dig-2);
+ else static if (real.mant_dig >= 53) // incl. 64-bit reals
+ assert(feqrel(logGamma(150.0L),log(gamma(150.0L))) > real.mant_dig-2);
+}
+
+
+private {
+/*
+ * These value can be calculated like this:
+ * 1) Get exact real.max/min_normal/epsilon from compiler:
+ * writefln!"%a"(real.max/min_normal_epsilon)
+ * 2) Convert for Wolfram Alpha
+ * 0xf.fffffffffffffffp+16380 ==> (f.fffffffffffffff base 16) * 2^16380
+ * 3) Calculate result on wofram alpha:
+ * http://www.wolframalpha.com/input/?i=ln((1.ffffffffffffffffffffffffffff+base+16)+*+2%5E16383)+in+base+2
+ * 4) Convert to proper format:
+ * string mantissa = "1.011...";
+ * write(mantissa[0 .. 2]); mantissa = mantissa[2 .. $];
+ * for (size_t i = 0; i < mantissa.length/4; i++)
+ * {
+ * writef!"%x"(to!ubyte(mantissa[0 .. 4], 2)); mantissa = mantissa[4 .. $];
+ * }
+ */
+static if (floatTraits!(real).realFormat == RealFormat.ieeeQuadruple)
+{
+ enum real MAXLOG = 0x1.62e42fefa39ef35793c7673007e6p+13; // log(real.max)
+ enum real MINLOG = -0x1.6546282207802c89d24d65e96274p+13; // log(real.min_normal*real.epsilon) = log(smallest denormal)
+}
+else static if (floatTraits!(real).realFormat == RealFormat.ieeeExtended)
+{
+ enum real MAXLOG = 0x1.62e42fefa39ef358p+13L; // log(real.max)
+ enum real MINLOG = -0x1.6436716d5406e6d8p+13L; // log(real.min_normal*real.epsilon) = log(smallest denormal)
+}
+else static if (floatTraits!(real).realFormat == RealFormat.ieeeDouble)
+{
+ enum real MAXLOG = 0x1.62e42fefa39efp+9L; // log(real.max)
+ enum real MINLOG = -0x1.74385446d71c3p+9L; // log(real.min_normal*real.epsilon) = log(smallest denormal)
+}
+else
+ static assert(0, "missing MAXLOG and MINLOG for other real types");
+
+enum real BETA_BIG = 9.223372036854775808e18L;
+enum real BETA_BIGINV = 1.084202172485504434007e-19L;
+}
+
+/** Incomplete beta integral
+ *
+ * Returns incomplete beta integral of the arguments, evaluated
+ * from zero to x. The regularized incomplete beta function is defined as
+ *
+ * betaIncomplete(a, b, x) = &Gamma;(a+b)/(&Gamma;(a) &Gamma;(b)) *
+ * $(INTEGRATE 0, x) $(POWER t, a-1)$(POWER (1-t),b-1) dt
+ *
+ * and is the same as the the cumulative distribution function.
+ *
+ * The domain of definition is 0 <= x <= 1. In this
+ * implementation a and b are restricted to positive values.
+ * The integral from x to 1 may be obtained by the symmetry
+ * relation
+ *
+ * betaIncompleteCompl(a, b, x ) = betaIncomplete( b, a, 1-x )
+ *
+ * The integral is evaluated by a continued fraction expansion
+ * or, when b*x is small, by a power series.
+ */
+real betaIncomplete(real aa, real bb, real xx )
+{
+ if ( !(aa>0 && bb>0) )
+ {
+ if ( isNaN(aa) ) return aa;
+ if ( isNaN(bb) ) return bb;
+ return real.nan; // domain error
+ }
+ if (!(xx>0 && xx<1.0))
+ {
+ if (isNaN(xx)) return xx;
+ if ( xx == 0.0L ) return 0.0;
+ if ( xx == 1.0L ) return 1.0;
+ return real.nan; // domain error
+ }
+ if ( (bb * xx) <= 1.0L && xx <= 0.95L)
+ {
+ return betaDistPowerSeries(aa, bb, xx);
+ }
+ real x;
+ real xc; // = 1 - x
+
+ real a, b;
+ int flag = 0;
+
+ /* Reverse a and b if x is greater than the mean. */
+ if ( xx > (aa/(aa+bb)) )
+ {
+ // here x > aa/(aa+bb) and (bb*x>1 or x>0.95)
+ flag = 1;
+ a = bb;
+ b = aa;
+ xc = xx;
+ x = 1.0L - xx;
+ }
+ else
+ {
+ a = aa;
+ b = bb;
+ xc = 1.0L - xx;
+ x = xx;
+ }
+
+ if ( flag == 1 && (b * x) <= 1.0L && x <= 0.95L)
+ {
+ // here xx > aa/(aa+bb) and ((bb*xx>1) or xx>0.95) and (aa*(1-xx)<=1) and xx > 0.05
+ return 1.0 - betaDistPowerSeries(a, b, x); // note loss of precision
+ }
+
+ real w;
+ // Choose expansion for optimal convergence
+ // One is for x * (a+b+2) < (a+1),
+ // the other is for x * (a+b+2) > (a+1).
+ real y = x * (a+b-2.0L) - (a-1.0L);
+ if ( y < 0.0L )
+ {
+ w = betaDistExpansion1( a, b, x );
+ }
+ else
+ {
+ w = betaDistExpansion2( a, b, x ) / xc;
+ }
+
+ /* Multiply w by the factor
+ a b
+ x (1-x) Gamma(a+b) / ( a Gamma(a) Gamma(b) ) . */
+
+ y = a * log(x);
+ real t = b * log(xc);
+ if ( (a+b) < MAXGAMMA && fabs(y) < MAXLOG && fabs(t) < MAXLOG )
+ {
+ t = pow(xc,b);
+ t *= pow(x,a);
+ t /= a;
+ t *= w;
+ t *= gamma(a+b) / (gamma(a) * gamma(b));
+ }
+ else
+ {
+ /* Resort to logarithms. */
+ y += t + logGamma(a+b) - logGamma(a) - logGamma(b);
+ y += log(w/a);
+
+ t = exp(y);
+/+
+ // There seems to be a bug in Cephes at this point.
+ // Problems occur for y > MAXLOG, not y < MINLOG.
+ if ( y < MINLOG )
+ {
+ t = 0.0L;
+ }
+ else
+ {
+ t = exp(y);
+ }
++/
+ }
+ if ( flag == 1 )
+ {
+/+ // CEPHES includes this code, but I think it is erroneous.
+ if ( t <= real.epsilon )
+ {
+ t = 1.0L - real.epsilon;
+ } else
++/
+ t = 1.0L - t;
+ }
+ return t;
+}
+
+/** Inverse of incomplete beta integral
+ *
+ * Given y, the function finds x such that
+ *
+ * betaIncomplete(a, b, x) == y
+ *
+ * Newton iterations or interval halving is used.
+ */
+real betaIncompleteInv(real aa, real bb, real yy0 )
+{
+ real a, b, y0, d, y, x, x0, x1, lgm, yp, di, dithresh, yl, yh, xt;
+ int i, rflg, dir, nflg;
+
+ if (isNaN(yy0)) return yy0;
+ if (isNaN(aa)) return aa;
+ if (isNaN(bb)) return bb;
+ if ( yy0 <= 0.0L )
+ return 0.0L;
+ if ( yy0 >= 1.0L )
+ return 1.0L;
+ x0 = 0.0L;
+ yl = 0.0L;
+ x1 = 1.0L;
+ yh = 1.0L;
+ if ( aa <= 1.0L || bb <= 1.0L )
+ {
+ dithresh = 1.0e-7L;
+ rflg = 0;
+ a = aa;
+ b = bb;
+ y0 = yy0;
+ x = a/(a+b);
+ y = betaIncomplete( a, b, x );
+ nflg = 0;
+ goto ihalve;
+ }
+ else
+ {
+ nflg = 0;
+ dithresh = 1.0e-4L;
+ }
+
+ // approximation to inverse function
+
+ yp = -normalDistributionInvImpl( yy0 );
+
+ if ( yy0 > 0.5L )
+ {
+ rflg = 1;
+ a = bb;
+ b = aa;
+ y0 = 1.0L - yy0;
+ yp = -yp;
+ }
+ else
+ {
+ rflg = 0;
+ a = aa;
+ b = bb;
+ y0 = yy0;
+ }
+
+ lgm = (yp * yp - 3.0L)/6.0L;
+ x = 2.0L/( 1.0L/(2.0L * a-1.0L) + 1.0L/(2.0L * b - 1.0L) );
+ d = yp * sqrt( x + lgm ) / x
+ - ( 1.0L/(2.0L * b - 1.0L) - 1.0L/(2.0L * a - 1.0L) )
+ * (lgm + (5.0L/6.0L) - 2.0L/(3.0L * x));
+ d = 2.0L * d;
+ if ( d < MINLOG )
+ {
+ x = 1.0L;
+ goto under;
+ }
+ x = a/( a + b * exp(d) );
+ y = betaIncomplete( a, b, x );
+ yp = (y - y0)/y0;
+ if ( fabs(yp) < 0.2 )
+ goto newt;
+
+ /* Resort to interval halving if not close enough. */
+ihalve:
+
+ dir = 0;
+ di = 0.5L;
+ for ( i=0; i<400; i++ )
+ {
+ if ( i != 0 )
+ {
+ x = x0 + di * (x1 - x0);
+ if ( x == 1.0L )
+ {
+ x = 1.0L - real.epsilon;
+ }
+ if ( x == 0.0L )
+ {
+ di = 0.5;
+ x = x0 + di * (x1 - x0);
+ if ( x == 0.0 )
+ goto under;
+ }
+ y = betaIncomplete( a, b, x );
+ yp = (x1 - x0)/(x1 + x0);
+ if ( fabs(yp) < dithresh )
+ goto newt;
+ yp = (y-y0)/y0;
+ if ( fabs(yp) < dithresh )
+ goto newt;
+ }
+ if ( y < y0 )
+ {
+ x0 = x;
+ yl = y;
+ if ( dir < 0 )
+ {
+ dir = 0;
+ di = 0.5L;
+ } else if ( dir > 3 )
+ di = 1.0L - (1.0L - di) * (1.0L - di);
+ else if ( dir > 1 )
+ di = 0.5L * di + 0.5L;
+ else
+ di = (y0 - y)/(yh - yl);
+ dir += 1;
+ if ( x0 > 0.95L )
+ {
+ if ( rflg == 1 )
+ {
+ rflg = 0;
+ a = aa;
+ b = bb;
+ y0 = yy0;
+ }
+ else
+ {
+ rflg = 1;
+ a = bb;
+ b = aa;
+ y0 = 1.0 - yy0;
+ }
+ x = 1.0L - x;
+ y = betaIncomplete( a, b, x );
+ x0 = 0.0;
+ yl = 0.0;
+ x1 = 1.0;
+ yh = 1.0;
+ goto ihalve;
+ }
+ }
+ else
+ {
+ x1 = x;
+ if ( rflg == 1 && x1 < real.epsilon )
+ {
+ x = 0.0L;
+ goto done;
+ }
+ yh = y;
+ if ( dir > 0 )
+ {
+ dir = 0;
+ di = 0.5L;
+ }
+ else if ( dir < -3 )
+ di = di * di;
+ else if ( dir < -1 )
+ di = 0.5L * di;
+ else
+ di = (y - y0)/(yh - yl);
+ dir -= 1;
+ }
+ }
+ if ( x0 >= 1.0L )
+ {
+ // partial loss of precision
+ x = 1.0L - real.epsilon;
+ goto done;
+ }
+ if ( x <= 0.0L )
+ {
+under:
+ // underflow has occurred
+ x = real.min_normal * real.min_normal;
+ goto done;
+ }
+
+newt:
+
+ if ( nflg )
+ {
+ goto done;
+ }
+ nflg = 1;
+ lgm = logGamma(a+b) - logGamma(a) - logGamma(b);
+
+ for ( i=0; i<15; i++ )
+ {
+ /* Compute the function at this point. */
+ if ( i != 0 )
+ y = betaIncomplete(a,b,x);
+ if ( y < yl )
+ {
+ x = x0;
+ y = yl;
+ }
+ else if ( y > yh )
+ {
+ x = x1;
+ y = yh;
+ }
+ else if ( y < y0 )
+ {
+ x0 = x;
+ yl = y;
+ }
+ else
+ {
+ x1 = x;
+ yh = y;
+ }
+ if ( x == 1.0L || x == 0.0L )
+ break;
+ /* Compute the derivative of the function at this point. */
+ d = (a - 1.0L) * log(x) + (b - 1.0L) * log(1.0L - x) + lgm;
+ if ( d < MINLOG )
+ {
+ goto done;
+ }
+ if ( d > MAXLOG )
+ {
+ break;
+ }
+ d = exp(d);
+ /* Compute the step to the next approximation of x. */
+ d = (y - y0)/d;
+ xt = x - d;
+ if ( xt <= x0 )
+ {
+ y = (x - x0) / (x1 - x0);
+ xt = x0 + 0.5L * y * (x - x0);
+ if ( xt <= 0.0L )
+ break;
+ }
+ if ( xt >= x1 )
+ {
+ y = (x1 - x) / (x1 - x0);
+ xt = x1 - 0.5L * y * (x1 - x);
+ if ( xt >= 1.0L )
+ break;
+ }
+ x = xt;
+ if ( fabs(d/x) < (128.0L * real.epsilon) )
+ goto done;
+ }
+ /* Did not converge. */
+ dithresh = 256.0L * real.epsilon;
+ goto ihalve;
+
+done:
+ if ( rflg )
+ {
+ if ( x <= real.epsilon )
+ x = 1.0L - real.epsilon;
+ else
+ x = 1.0L - x;
+ }
+ return x;
+}
+
+@safe unittest { // also tested by the normal distribution
+ // check NaN propagation
+ assert(isIdentical(betaIncomplete(NaN(0xABC),2,3), NaN(0xABC)));
+ assert(isIdentical(betaIncomplete(7,NaN(0xABC),3), NaN(0xABC)));
+ assert(isIdentical(betaIncomplete(7,15,NaN(0xABC)), NaN(0xABC)));
+ assert(isIdentical(betaIncompleteInv(NaN(0xABC),1,17), NaN(0xABC)));
+ assert(isIdentical(betaIncompleteInv(2,NaN(0xABC),8), NaN(0xABC)));
+ assert(isIdentical(betaIncompleteInv(2,3, NaN(0xABC)), NaN(0xABC)));
+
+ assert(isNaN(betaIncomplete(-1, 2, 3)));
+
+ assert(betaIncomplete(1, 2, 0)==0);
+ assert(betaIncomplete(1, 2, 1)==1);
+ assert(isNaN(betaIncomplete(1, 2, 3)));
+ assert(betaIncompleteInv(1, 1, 0)==0);
+ assert(betaIncompleteInv(1, 1, 1)==1);
+
+ // Test against Mathematica betaRegularized[z,a,b]
+ // These arbitrary points are chosen to give good code coverage.
+ assert(feqrel(betaIncomplete(8, 10, 0.2), 0.010_934_315_234_099_2L) >= real.mant_dig - 5);
+ assert(feqrel(betaIncomplete(2, 2.5, 0.9), 0.989_722_597_604_452_767_171_003_59L) >= real.mant_dig - 1);
+ static if (real.mant_dig >= 64) // incl. 80-bit reals
+ assert(feqrel(betaIncomplete(1000, 800, 0.5), 1.179140859734704555102808541457164E-06L) >= real.mant_dig - 13);
+ else
+ assert(feqrel(betaIncomplete(1000, 800, 0.5), 1.179140859734704555102808541457164E-06L) >= real.mant_dig - 14);
+ assert(feqrel(betaIncomplete(0.0001, 10000, 0.0001), 0.999978059362107134278786L) >= real.mant_dig - 18);
+ assert(betaIncomplete(0.01, 327726.7, 0.545113) == 1.0);
+ assert(feqrel(betaIncompleteInv(8, 10, 0.010_934_315_234_099_2L), 0.2L) >= real.mant_dig - 2);
+ assert(feqrel(betaIncomplete(0.01, 498.437, 0.0121433), 0.99999664562033077636065L) >= real.mant_dig - 1);
+ assert(feqrel(betaIncompleteInv(5, 10, 0.2000002972865658842), 0.229121208190918L) >= real.mant_dig - 3);
+ assert(feqrel(betaIncompleteInv(4, 7, 0.8000002209179505L), 0.483657360076904L) >= real.mant_dig - 3);
+
+ // Coverage tests. I don't have correct values for these tests, but
+ // these values cover most of the code, so they are useful for
+ // regression testing.
+ // Extensive testing failed to increase the coverage. It seems likely that about
+ // half the code in this function is unnecessary; there is potential for
+ // significant improvement over the original CEPHES code.
+ static if (real.mant_dig == 64) // 80-bit reals
+ {
+ assert(betaIncompleteInv(0.01, 8e-48, 5.45464e-20) == 1-real.epsilon);
+ assert(betaIncompleteInv(0.01, 8e-48, 9e-26) == 1-real.epsilon);
+
+ // Beware: a one-bit change in pow() changes almost all digits in the result!
+ assert(feqrel(
+ betaIncompleteInv(0x1.b3d151fbba0eb18p+1, 1.2265e-19, 2.44859e-18),
+ 0x1.c0110c8531d0952cp-1L
+ ) > 10);
+ // This next case uncovered a one-bit difference in the FYL2X instruction
+ // between Intel and AMD processors. This difference gets magnified by 2^^38.
+ // WolframAlpha crashes attempting to calculate this.
+ assert(feqrel(betaIncompleteInv(0x1.ff1275ae5b939bcap-41, 4.6713e18, 0.0813601),
+ 0x1.f97749d90c7adba8p-63L) >= real.mant_dig - 39);
+ real a1 = 3.40483;
+ assert(betaIncompleteInv(a1, 4.0640301659679627772e19L, 0.545113) == 0x1.ba8c08108aaf5d14p-109);
+ real b1 = 2.82847e-25;
+ assert(feqrel(betaIncompleteInv(0.01, b1, 9e-26), 0x1.549696104490aa9p-830L) >= real.mant_dig-10);
+
+ // --- Problematic cases ---
+ // This is a situation where the series expansion fails to converge
+ assert( isNaN(betaIncompleteInv(0.12167, 4.0640301659679627772e19L, 0.0813601)));
+ // This next result is almost certainly erroneous.
+ // Mathematica states: "(cannot be determined by current methods)"
+ assert(betaIncomplete(1.16251e20, 2.18e39, 5.45e-20) == -real.infinity);
+ // WolframAlpha gives no result for this, though indicates that it approximately 1.0 - 1.3e-9
+ assert(1 - betaIncomplete(0.01, 328222, 4.0375e-5) == 0x1.5f62926b4p-30);
+ }
+}
+
+
+private {
+// Implementation functions
+
+// Continued fraction expansion #1 for incomplete beta integral
+// Use when x < (a+1)/(a+b+2)
+real betaDistExpansion1(real a, real b, real x )
+{
+ real xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+ real k1, k2, k3, k4, k5, k6, k7, k8;
+ real r, t, ans;
+ int n;
+
+ k1 = a;
+ k2 = a + b;
+ k3 = a;
+ k4 = a + 1.0L;
+ k5 = 1.0L;
+ k6 = b - 1.0L;
+ k7 = k4;
+ k8 = a + 2.0L;
+
+ pkm2 = 0.0L;
+ qkm2 = 1.0L;
+ pkm1 = 1.0L;
+ qkm1 = 1.0L;
+ ans = 1.0L;
+ r = 1.0L;
+ n = 0;
+ const real thresh = 3.0L * real.epsilon;
+ do
+ {
+ xk = -( x * k1 * k2 )/( k3 * k4 );
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ xk = ( x * k5 * k6 )/( k7 * k8 );
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ if ( qk != 0.0L )
+ r = pk/qk;
+ if ( r != 0.0L )
+ {
+ t = fabs( (ans - r)/r );
+ ans = r;
+ }
+ else
+ {
+ t = 1.0L;
+ }
+
+ if ( t < thresh )
+ return ans;
+
+ k1 += 1.0L;
+ k2 += 1.0L;
+ k3 += 2.0L;
+ k4 += 2.0L;
+ k5 += 1.0L;
+ k6 -= 1.0L;
+ k7 += 2.0L;
+ k8 += 2.0L;
+
+ if ( (fabs(qk) + fabs(pk)) > BETA_BIG )
+ {
+ pkm2 *= BETA_BIGINV;
+ pkm1 *= BETA_BIGINV;
+ qkm2 *= BETA_BIGINV;
+ qkm1 *= BETA_BIGINV;
+ }
+ if ( (fabs(qk) < BETA_BIGINV) || (fabs(pk) < BETA_BIGINV) )
+ {
+ pkm2 *= BETA_BIG;
+ pkm1 *= BETA_BIG;
+ qkm2 *= BETA_BIG;
+ qkm1 *= BETA_BIG;
+ }
+ }
+ while ( ++n < 400 );
+// loss of precision has occurred
+// mtherr( "incbetl", PLOSS );
+ return ans;
+}
+
+// Continued fraction expansion #2 for incomplete beta integral
+// Use when x > (a+1)/(a+b+2)
+real betaDistExpansion2(real a, real b, real x )
+{
+ real xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+ real k1, k2, k3, k4, k5, k6, k7, k8;
+ real r, t, ans, z;
+
+ k1 = a;
+ k2 = b - 1.0L;
+ k3 = a;
+ k4 = a + 1.0L;
+ k5 = 1.0L;
+ k6 = a + b;
+ k7 = a + 1.0L;
+ k8 = a + 2.0L;
+
+ pkm2 = 0.0L;
+ qkm2 = 1.0L;
+ pkm1 = 1.0L;
+ qkm1 = 1.0L;
+ z = x / (1.0L-x);
+ ans = 1.0L;
+ r = 1.0L;
+ int n = 0;
+ const real thresh = 3.0L * real.epsilon;
+ do
+ {
+ xk = -( z * k1 * k2 )/( k3 * k4 );
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ xk = ( z * k5 * k6 )/( k7 * k8 );
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ if ( qk != 0.0L )
+ r = pk/qk;
+ if ( r != 0.0L )
+ {
+ t = fabs( (ans - r)/r );
+ ans = r;
+ } else
+ t = 1.0L;
+
+ if ( t < thresh )
+ return ans;
+ k1 += 1.0L;
+ k2 -= 1.0L;
+ k3 += 2.0L;
+ k4 += 2.0L;
+ k5 += 1.0L;
+ k6 += 1.0L;
+ k7 += 2.0L;
+ k8 += 2.0L;
+
+ if ( (fabs(qk) + fabs(pk)) > BETA_BIG )
+ {
+ pkm2 *= BETA_BIGINV;
+ pkm1 *= BETA_BIGINV;
+ qkm2 *= BETA_BIGINV;
+ qkm1 *= BETA_BIGINV;
+ }
+ if ( (fabs(qk) < BETA_BIGINV) || (fabs(pk) < BETA_BIGINV) )
+ {
+ pkm2 *= BETA_BIG;
+ pkm1 *= BETA_BIG;
+ qkm2 *= BETA_BIG;
+ qkm1 *= BETA_BIG;
+ }
+ } while ( ++n < 400 );
+// loss of precision has occurred
+//mtherr( "incbetl", PLOSS );
+ return ans;
+}
+
+/* Power series for incomplete gamma integral.
+ Use when b*x is small. */
+real betaDistPowerSeries(real a, real b, real x )
+{
+ real ai = 1.0L / a;
+ real u = (1.0L - b) * x;
+ real v = u / (a + 1.0L);
+ real t1 = v;
+ real t = u;
+ real n = 2.0L;
+ real s = 0.0L;
+ real z = real.epsilon * ai;
+ while ( fabs(v) > z )
+ {
+ u = (n - b) * x / n;
+ t *= u;
+ v = t / (a + n);
+ s += v;
+ n += 1.0L;
+ }
+ s += t1;
+ s += ai;
+
+ u = a * log(x);
+ if ( (a+b) < MAXGAMMA && fabs(u) < MAXLOG )
+ {
+ t = gamma(a+b)/(gamma(a)*gamma(b));
+ s = s * t * pow(x,a);
+ }
+ else
+ {
+ t = logGamma(a+b) - logGamma(a) - logGamma(b) + u + log(s);
+
+ if ( t < MINLOG )
+ {
+ s = 0.0L;
+ } else
+ s = exp(t);
+ }
+ return s;
+}
+
+}
+
+/***************************************
+ * Incomplete gamma integral and its complement
+ *
+ * These functions are defined by
+ *
+ * gammaIncomplete = ( $(INTEGRATE 0, x) $(POWER e, -t) $(POWER t, a-1) dt )/ $(GAMMA)(a)
+ *
+ * gammaIncompleteCompl(a,x) = 1 - gammaIncomplete(a,x)
+ * = ($(INTEGRATE x, &infin;) $(POWER e, -t) $(POWER t, a-1) dt )/ $(GAMMA)(a)
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ */
+real gammaIncomplete(real a, real x )
+in {
+ assert(x >= 0);
+ assert(a > 0);
+}
+body {
+ /* left tail of incomplete gamma function:
+ *
+ * inf. k
+ * a -x - x
+ * x e > ----------
+ * - -
+ * k=0 | (a+k+1)
+ *
+ */
+ if (x == 0)
+ return 0.0L;
+
+ if ( (x > 1.0L) && (x > a ) )
+ return 1.0L - gammaIncompleteCompl(a,x);
+
+ real ax = a * log(x) - x - logGamma(a);
+/+
+ if ( ax < MINLOGL ) return 0; // underflow
+ // { mtherr( "igaml", UNDERFLOW ); return( 0.0L ); }
++/
+ ax = exp(ax);
+
+ /* power series */
+ real r = a;
+ real c = 1.0L;
+ real ans = 1.0L;
+
+ do
+ {
+ r += 1.0L;
+ c *= x/r;
+ ans += c;
+ } while ( c/ans > real.epsilon );
+
+ return ans * ax/a;
+}
+
+/** ditto */
+real gammaIncompleteCompl(real a, real x )
+in {
+ assert(x >= 0);
+ assert(a > 0);
+}
+body {
+ if (x == 0)
+ return 1.0L;
+ if ( (x < 1.0L) || (x < a) )
+ return 1.0L - gammaIncomplete(a,x);
+
+ // DAC (Cephes bug fix): This is necessary to avoid
+ // spurious nans, eg
+ // log(x)-x = NaN when x = real.infinity
+ const real MAXLOGL = 1.1356523406294143949492E4L;
+ if (x > MAXLOGL)
+ return igammaTemmeLarge(a, x);
+
+ real ax = a * log(x) - x - logGamma(a);
+//const real MINLOGL = -1.1355137111933024058873E4L;
+// if ( ax < MINLOGL ) return 0; // underflow;
+ ax = exp(ax);
+
+
+ /* continued fraction */
+ real y = 1.0L - a;
+ real z = x + y + 1.0L;
+ real c = 0.0L;
+
+ real pk, qk, t;
+
+ real pkm2 = 1.0L;
+ real qkm2 = x;
+ real pkm1 = x + 1.0L;
+ real qkm1 = z * x;
+ real ans = pkm1/qkm1;
+
+ do
+ {
+ c += 1.0L;
+ y += 1.0L;
+ z += 2.0L;
+ real yc = y * c;
+ pk = pkm1 * z - pkm2 * yc;
+ qk = qkm1 * z - qkm2 * yc;
+ if ( qk != 0.0L )
+ {
+ real r = pk/qk;
+ t = fabs( (ans - r)/r );
+ ans = r;
+ }
+ else
+ {
+ t = 1.0L;
+ }
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ const real BIG = 9.223372036854775808e18L;
+
+ if ( fabs(pk) > BIG )
+ {
+ pkm2 /= BIG;
+ pkm1 /= BIG;
+ qkm2 /= BIG;
+ qkm1 /= BIG;
+ }
+ } while ( t > real.epsilon );
+
+ return ans * ax;
+}
+
+/** Inverse of complemented incomplete gamma integral
+ *
+ * Given a and p, the function finds x such that
+ *
+ * gammaIncompleteCompl( a, x ) = p.
+ *
+ * Starting with the approximate value x = a $(POWER t, 3), where
+ * t = 1 - d - normalDistributionInv(p) sqrt(d),
+ * and d = 1/9a,
+ * the routine performs up to 10 Newton iterations to find the
+ * root of incompleteGammaCompl(a,x) - p = 0.
+ */
+real gammaIncompleteComplInv(real a, real p)
+in {
+ assert(p >= 0 && p <= 1);
+ assert(a>0);
+}
+body {
+ if (p == 0) return real.infinity;
+
+ real y0 = p;
+ const real MAXLOGL = 1.1356523406294143949492E4L;
+ real x0, x1, x, yl, yh, y, d, lgm, dithresh;
+ int i, dir;
+
+ /* bound the solution */
+ x0 = real.max;
+ yl = 0.0L;
+ x1 = 0.0L;
+ yh = 1.0L;
+ dithresh = 4.0 * real.epsilon;
+
+ /* approximation to inverse function */
+ d = 1.0L/(9.0L*a);
+ y = 1.0L - d - normalDistributionInvImpl(y0) * sqrt(d);
+ x = a * y * y * y;
+
+ lgm = logGamma(a);
+
+ for ( i=0; i<10; i++ )
+ {
+ if ( x > x0 || x < x1 )
+ goto ihalve;
+ y = gammaIncompleteCompl(a,x);
+ if ( y < yl || y > yh )
+ goto ihalve;
+ if ( y < y0 )
+ {
+ x0 = x;
+ yl = y;
+ }
+ else
+ {
+ x1 = x;
+ yh = y;
+ }
+ /* compute the derivative of the function at this point */
+ d = (a - 1.0L) * log(x0) - x0 - lgm;
+ if ( d < -MAXLOGL )
+ goto ihalve;
+ d = -exp(d);
+ /* compute the step to the next approximation of x */
+ d = (y - y0)/d;
+ x = x - d;
+ if ( i < 3 ) continue;
+ if ( fabs(d/x) < dithresh ) return x;
+ }
+
+ /* Resort to interval halving if Newton iteration did not converge. */
+ihalve:
+ d = 0.0625L;
+ if ( x0 == real.max )
+ {
+ if ( x <= 0.0L )
+ x = 1.0L;
+ while ( x0 == real.max )
+ {
+ x = (1.0L + d) * x;
+ y = gammaIncompleteCompl( a, x );
+ if ( y < y0 )
+ {
+ x0 = x;
+ yl = y;
+ break;
+ }
+ d = d + d;
+ }
+ }
+ d = 0.5L;
+ dir = 0;
+
+ for ( i=0; i<400; i++ )
+ {
+ x = x1 + d * (x0 - x1);
+ y = gammaIncompleteCompl( a, x );
+ lgm = (x0 - x1)/(x1 + x0);
+ if ( fabs(lgm) < dithresh )
+ break;
+ lgm = (y - y0)/y0;
+ if ( fabs(lgm) < dithresh )
+ break;
+ if ( x <= 0.0L )
+ break;
+ if ( y > y0 )
+ {
+ x1 = x;
+ yh = y;
+ if ( dir < 0 )
+ {
+ dir = 0;
+ d = 0.5L;
+ } else if ( dir > 1 )
+ d = 0.5L * d + 0.5L;
+ else
+ d = (y0 - yl)/(yh - yl);
+ dir += 1;
+ }
+ else
+ {
+ x0 = x;
+ yl = y;
+ if ( dir > 0 )
+ {
+ dir = 0;
+ d = 0.5L;
+ } else if ( dir < -1 )
+ d = 0.5L * d;
+ else
+ d = (y0 - yl)/(yh - yl);
+ dir -= 1;
+ }
+ }
+ /+
+ if ( x == 0.0L )
+ mtherr( "igamil", UNDERFLOW );
+ +/
+ return x;
+}
+
+@safe unittest
+{
+//Values from Excel's GammaInv(1-p, x, 1)
+assert(fabs(gammaIncompleteComplInv(1, 0.5) - 0.693147188044814) < 0.00000005);
+assert(fabs(gammaIncompleteComplInv(12, 0.99) - 5.42818075054289) < 0.00000005);
+assert(fabs(gammaIncompleteComplInv(100, 0.8) - 91.5013985848288L) < 0.000005);
+assert(gammaIncomplete(1, 0)==0);
+assert(gammaIncompleteCompl(1, 0)==1);
+assert(gammaIncomplete(4545, real.infinity)==1);
+
+// Values from Excel's (1-GammaDist(x, alpha, 1, TRUE))
+
+assert(fabs(1.0L-gammaIncompleteCompl(0.5, 2) - 0.954499729507309L) < 0.00000005);
+assert(fabs(gammaIncomplete(0.5, 2) - 0.954499729507309L) < 0.00000005);
+// Fixed Cephes bug:
+assert(gammaIncompleteCompl(384, real.infinity)==0);
+assert(gammaIncompleteComplInv(3, 0)==real.infinity);
+// Fixed a bug that caused gammaIncompleteCompl to return a wrong value when
+// x was larger than a, but not by much, and both were large:
+// The value is from WolframAlpha (Gamma[100000, 100001, inf] / Gamma[100000])
+static if (real.mant_dig >= 64) // incl. 80-bit reals
+ assert(fabs(gammaIncompleteCompl(100000, 100001) - 0.49831792109) < 0.000000000005);
+else
+ assert(fabs(gammaIncompleteCompl(100000, 100001) - 0.49831792109) < 0.00000005);
+}
+
+
+// DAC: These values are Bn / n for n=2,4,6,8,10,12,14.
+immutable real [7] Bn_n = [
+ 1.0L/(6*2), -1.0L/(30*4), 1.0L/(42*6), -1.0L/(30*8),
+ 5.0L/(66*10), -691.0L/(2730*12), 7.0L/(6*14) ];
+
+/** Digamma function
+*
+* The digamma function is the logarithmic derivative of the gamma function.
+*
+* digamma(x) = d/dx logGamma(x)
+*
+* References:
+* 1. Abramowitz, M., and Stegun, I. A. (1970).
+* Handbook of mathematical functions. Dover, New York,
+* pages 258-259, equations 6.3.6 and 6.3.18.
+*/
+real digamma(real x)
+{
+ // Based on CEPHES, Stephen L. Moshier.
+
+ real p, q, nz, s, w, y, z;
+ long i, n;
+ int negative;
+
+ negative = 0;
+ nz = 0.0;
+
+ if ( x <= 0.0 )
+ {
+ negative = 1;
+ q = x;
+ p = floor(q);
+ if ( p == q )
+ {
+ return real.nan; // singularity.
+ }
+ /* Remove the zeros of tan(PI x)
+ * by subtracting the nearest integer from x
+ */
+ nz = q - p;
+ if ( nz != 0.5 )
+ {
+ if ( nz > 0.5 )
+ {
+ p += 1.0;
+ nz = q - p;
+ }
+ nz = PI/tan(PI*nz);
+ }
+ else
+ {
+ nz = 0.0;
+ }
+ x = 1.0 - x;
+ }
+
+ // check for small positive integer
+ if ((x <= 13.0) && (x == floor(x)) )
+ {
+ y = 0.0;
+ n = lrint(x);
+ // DAC: CEPHES bugfix. Cephes did this in reverse order, which
+ // created a larger roundoff error.
+ for (i=n-1; i>0; --i)
+ {
+ y+=1.0L/i;
+ }
+ y -= EULERGAMMA;
+ goto done;
+ }
+
+ s = x;
+ w = 0.0;
+ while ( s < 10.0 )
+ {
+ w += 1.0/s;
+ s += 1.0;
+ }
+
+ if ( s < 1.0e17 )
+ {
+ z = 1.0/(s * s);
+ y = z * poly(z, Bn_n);
+ } else
+ y = 0.0;
+
+ y = log(s) - 0.5L/s - y - w;
+
+done:
+ if ( negative )
+ {
+ y -= nz;
+ }
+ return y;
+}
+
+@safe unittest
+{
+ // Exact values
+ assert(digamma(1.0)== -EULERGAMMA);
+ assert(feqrel(digamma(0.25), -PI/2 - 3* LN2 - EULERGAMMA) >= real.mant_dig-7);
+ assert(feqrel(digamma(1.0L/6), -PI/2 *sqrt(3.0L) - 2* LN2 -1.5*log(3.0L) - EULERGAMMA) >= real.mant_dig-7);
+ assert(digamma(-5.0).isNaN());
+ assert(feqrel(digamma(2.5), -EULERGAMMA - 2*LN2 + 2.0 + 2.0L/3) >= real.mant_dig-9);
+ assert(isIdentical(digamma(NaN(0xABC)), NaN(0xABC)));
+
+ for (int k=1; k<40; ++k)
+ {
+ real y=0;
+ for (int u=k; u >= 1; --u)
+ {
+ y += 1.0L/u;
+ }
+ assert(feqrel(digamma(k+1.0), -EULERGAMMA + y) >= real.mant_dig-2);
+ }
+}
+
+/** Log Minus Digamma function
+*
+* logmdigamma(x) = log(x) - digamma(x)
+*
+* References:
+* 1. Abramowitz, M., and Stegun, I. A. (1970).
+* Handbook of mathematical functions. Dover, New York,
+* pages 258-259, equations 6.3.6 and 6.3.18.
+*/
+real logmdigamma(real x)
+{
+ if (x <= 0.0)
+ {
+ if (x == 0.0)
+ {
+ return real.infinity;
+ }
+ return real.nan;
+ }
+
+ real s = x;
+ real w = 0.0;
+ while ( s < 10.0 )
+ {
+ w += 1.0/s;
+ s += 1.0;
+ }
+
+ real y;
+ if ( s < 1.0e17 )
+ {
+ immutable real z = 1.0/(s * s);
+ y = z * poly(z, Bn_n);
+ } else
+ y = 0.0;
+
+ return x == s ? y + 0.5L/s : (log(x/s) + 0.5L/s + y + w);
+}
+
+@safe unittest
+{
+ assert(logmdigamma(-5.0).isNaN());
+ assert(isIdentical(logmdigamma(NaN(0xABC)), NaN(0xABC)));
+ assert(logmdigamma(0.0) == real.infinity);
+ for (auto x = 0.01; x < 1.0; x += 0.1)
+ assert(approxEqual(digamma(x), log(x) - logmdigamma(x)));
+ for (auto x = 1.0; x < 15.0; x += 1.0)
+ assert(approxEqual(digamma(x), log(x) - logmdigamma(x)));
+}
+
+/** Inverse of the Log Minus Digamma function
+ *
+ * Returns x such $(D log(x) - digamma(x) == y).
+ *
+ * References:
+ * 1. Abramowitz, M., and Stegun, I. A. (1970).
+ * Handbook of mathematical functions. Dover, New York,
+ * pages 258-259, equation 6.3.18.
+ *
+ * Authors: Ilya Yaroshenko
+ */
+real logmdigammaInverse(real y)
+{
+ import std.numeric : findRoot;
+ // FIXME: should be returned back to enum.
+ // Fix requires CTFEable `log` on non-x86 targets (check both LDC and GDC).
+ immutable maxY = logmdigamma(real.min_normal);
+ assert(maxY > 0 && maxY <= real.max);
+
+ if (y >= maxY)
+ {
+ //lim x->0 (log(x)-digamma(x))*x == 1
+ return 1 / y;
+ }
+ if (y < 0)
+ {
+ return real.nan;
+ }
+ if (y < real.min_normal)
+ {
+ //6.3.18
+ return 0.5 / y;
+ }
+ if (y > 0)
+ {
+ // x/2 <= logmdigamma(1 / x) <= x, x > 0
+ // calls logmdigamma ~6 times
+ return 1 / findRoot((real x) => logmdigamma(1 / x) - y, y, 2*y);
+ }
+ return y; //NaN
+}
+
+@safe unittest
+{
+ import std.typecons;
+ //WolframAlpha, 22.02.2015
+ immutable Tuple!(real, real)[5] testData = [
+ tuple(1.0L, 0.615556766479594378978099158335549201923L),
+ tuple(1.0L/8, 4.15937801516894947161054974029150730555L),
+ tuple(1.0L/1024, 512.166612384991507850643277924243523243L),
+ tuple(0.000500083333325000003968249801594877323784632117L, 1000.0L),
+ tuple(1017.644138623741168814449776695062817947092468536L, 1.0L/1024),
+ ];
+ foreach (test; testData)
+ assert(approxEqual(logmdigammaInverse(test[0]), test[1], 2e-15, 0));
+
+ assert(approxEqual(logmdigamma(logmdigammaInverse(1)), 1, 1e-15, 0));
+ assert(approxEqual(logmdigamma(logmdigammaInverse(real.min_normal)), real.min_normal, 1e-15, 0));
+ assert(approxEqual(logmdigamma(logmdigammaInverse(real.max/2)), real.max/2, 1e-15, 0));
+ assert(approxEqual(logmdigammaInverse(logmdigamma(1)), 1, 1e-15, 0));
+ assert(approxEqual(logmdigammaInverse(logmdigamma(real.min_normal)), real.min_normal, 1e-15, 0));
+ assert(approxEqual(logmdigammaInverse(logmdigamma(real.max/2)), real.max/2, 1e-15, 0));
+}