1 files changed, 1353 insertions, 0 deletions
diff --git a/libphobos/src/std/internal/math/biguintx86.d b/libphobos/src/std/internal/math/biguintx86.d
new file mode 100644
index 0000000..bd03d2e
--- /dev/null
+++ b/libphobos/src/std/internal/math/biguintx86.d
@@ -0,0 +1,1353 @@
+/** Optimised asm arbitrary precision arithmetic ('bignum')
+ * routines for X86 processors.
+ *
+ * All functions operate on arrays of uints, stored LSB first.
+ * If there is a destination array, it will be the first parameter.
+ * Currently, all of these functions are subject to change, and are
+ * intended for internal use only.
+ * The symbol [#] indicates an array of machine words which is to be
+ * interpreted as a multi-byte number.
+ */
+
+/*          Copyright Don Clugston 2008 - 2010.
+ * Distributed under the Boost Software License, Version 1.0.
+ *    (See accompanying file LICENSE_1_0.txt or copy at
+ *          http://www.boost.org/LICENSE_1_0.txt)
+ */
+/**
+ * In simple terms, there are 3 modern x86 microarchitectures:
+ * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
+ * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
+ * (c) the Pentium 4, produced by Marketing.
+ *
+ * This code has been optimised for the Intel P6 family.
+ * Generally the code remains near-optimal for Intel Core2/Corei7, after
+ * translating EAX-> RAX, etc, since all these CPUs use essentially the same
+ * pipeline, and are typically limited by memory access.
+ * The code uses techniques described in Agner Fog's superb Pentium manuals
+ * available at www.agner.org.
+ * Not optimised for AMD, which can do two memory loads per cycle (Intel
+ * CPUs can only do one). Despite this, performance is superior on AMD.
+ * Performance is dreadful on P4.
+ *
+ *  Timing results (cycles per int)
+ *              --Intel Pentium--  --AMD--
+ *              PM     P4   Core2   K7
+ *  +,-         2.25  15.6   2.25   1.5
+ *  <<,>>       2.0    6.6   2.0    5.0
+ *    (<< MMX)  1.7    5.3   1.5    1.2
+ *  *           5.0   15.0   4.0    4.3
+ *  mulAdd      5.7   19.0   4.9    4.0
+ *  div        30.0   32.0  32.0   22.4
+ *  mulAcc(32)  6.5   20.0   5.4    4.9
+ *
+ * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
+ * function call overhead.
+ * The timing for Div is quite unpredictable, but it's probably too slow
+ * to be useful. On 64-bit processors, these times should
+ * halve if run in 64-bit mode, except for the MMX functions.
+ */
+
+module std.internal.math.biguintx86;
+
+@system:
+pure:
+nothrow:
+
+/*
+  Naked asm is used throughout, because:
+  (a) it frees up the EBP register
+  (b) compiler bugs prevent the use of .ptr when a frame pointer is used.
+*/
+
+version (D_InlineAsm_X86)
+{
+
+private:
+
+/* Duplicate string s, with n times, substituting index for '@'.
+ *
+ * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
+ * function for some of the asm routines.
+ */
+string indexedLoopUnroll(int n, string s) pure @safe
+{
+    string u;
+    for (int i = 0; i<n; ++i)
+    {
+        string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
+
+        int last = 0;
+        for (int j = 0; j<s.length; ++j)
+        {
+            if (s[j]=='@')
+            {
+                u ~= s[last .. j] ~ nstr;
+                last = j+1;
+            }
+        }
+        if (last<s.length) u = u ~ s[last..$];
+
+    }
+    return u;
+}
+@safe unittest
+{
+    assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
+}
+
+public:
+
+alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size.
+
+// Limits for when to switch between multiplication algorithms.
+enum : int { KARATSUBALIMIT = 18 }; // Minimum value for which Karatsuba is worthwhile.
+enum : int { KARATSUBASQUARELIMIT=26 }; // Minimum value for which square Karatsuba is worthwhile
+
+/** Multi-byte addition or subtraction
+ *    dest[#] = src1[#] + src2[#] + carry (0 or 1).
+ * or dest[#] = src1[#] - src2[#] - carry (0 or 1).
+ * Returns carry or borrow (0 or 1).
+ * Set op == '+' for addition, '-' for subtraction.
+ */
+uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint []
+        src2, uint carry) pure
+{
+    // Timing:
+    // Pentium M: 2.25/int
+    // P6 family, Core2 have a partial flags stall when reading the carry flag in
+    // an ADC, SBB operation after an operation such as INC or DEC which
+    // modifies some, but not all, flags. We avoid this by storing carry into
+    // a resister (AL), and restoring it after the branch.
+
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm pure nothrow {
+        naked;
+        push EDI;
+        push EBX;
+        push ESI;
+        mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
+        mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
+        mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
+        mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
+             // Carry is in EAX
+        // Count UP to zero (from -len) to minimize loop overhead.
+        lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
+        lea ESI, [ESI + 4*ECX]; // EBP = end of src2.
+        lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
+
+        neg ECX;
+        add ECX, 8;
+        jb L2;  // if length < 8 , bypass the unrolled loop.
+L_unrolled:
+        shr AL, 1; // get carry from EAX
+    }
+    mixin(" asm pure nothrow {"
+        ~ indexedLoopUnroll( 8,
+        "mov EAX, [@*4-8*4+EDX+ECX*4];"
+        ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
+        ~ "mov [@*4-8*4+EDI+ECX*4], EAX;")
+        ~ "}");
+    asm pure nothrow {
+        setc AL; // save carry
+        add ECX, 8;
+        ja L_unrolled;
+L2:     // Do the residual 1 .. 7 ints.
+
+        sub ECX, 8;
+        jz done;
+L_residual:
+        shr AL, 1; // get carry from EAX
+    }
+    mixin(" asm pure nothrow {"
+        ~ indexedLoopUnroll( 1,
+        "mov EAX, [@*4+EDX+ECX*4];"
+        ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
+        ~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
+    asm pure nothrow {
+        setc AL; // save carry
+        add ECX, 1;
+        jnz L_residual;
+done:
+        and EAX, 1; // make it O or 1.
+        pop ESI;
+        pop EBX;
+        pop EDI;
+        ret 6*4;
+    }
+}
+
+@system unittest
+{
+    uint [] a = new uint[40];
+    uint [] b = new uint[40];
+    uint [] c = new uint[40];
+    for (int i=0; i<a.length; ++i)
+    {
+        if (i&1) a[i]=0x8000_0000 + i;
+        else a[i]=i;
+        b[i]= 0x8000_0003;
+    }
+    c[19]=0x3333_3333;
+    uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0);
+    assert(carry == 1);
+    assert(c[0]==0x8000_0003);
+    assert(c[1]==4);
+    assert(c[19]==0x3333_3333); // check for overrun
+    for (int i=0; i<a.length; ++i)
+    {
+        a[i]=b[i]=c[i]=0;
+    }
+    a[8]=0x048D159E;
+    b[8]=0x048D159E;
+    a[10]=0x1D950C84;
+    b[10]=0x1D950C84;
+    a[5] =0x44444444;
+    carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
+    assert(a[11]==0);
+    for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0);
+
+    for (int q=3; q<36;++q)
+    {
+        for (int i=0; i<a.length; ++i)
+        {
+            a[i]=b[i]=c[i]=0;
+        }
+        a[q-2]=0x040000;
+        b[q-2]=0x040000;
+       carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
+       assert(a[q-2]==0);
+    }
+}
+
+/** dest[#] += carry, or dest[#] -= carry.
+ *  op must be '+' or '-'
+ *  Returns final carry or borrow (0 or 1)
+ */
+uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure
+{
+    enum { LASTPARAM = 1*4 } // 0* pushes + return address.
+    asm pure nothrow {
+        naked;
+        mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
+        mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
+        // EAX  = carry
+L1: ;
+    }
+    static if (op=='+')
+        asm pure nothrow { add [EDX], EAX; }
+    else
+        asm pure nothrow { sub [EDX], EAX; }
+    asm pure nothrow {
+        mov EAX, 1;
+        jnc L2;
+        add EDX, 4;
+        dec ECX;
+        jnz L1;
+        mov EAX, 2;
+L2:     dec EAX;
+        ret 2*4;
+    }
+}
+
+/** dest[#] = src[#] << numbits
+ *  numbits must be in the range 1 .. 31
+ *  Returns the overflow
+ */
+uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure
+{
+    // Timing: Optimal for P6 family.
+    // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
+    // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm pure nothrow {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+        mov ECX, EAX; // numbits;
+
+        mov EAX, [-4+ESI + 4*EBX];
+        mov EDX, 0;
+        shld EDX, EAX, CL;
+        push EDX; // Save return value
+        cmp EBX, 1;
+        jz L_last;
+        mov EDX, [-4+ESI + 4*EBX];
+        test EBX, 1;
+        jz L_odd;
+        sub EBX, 1;
+L_even:
+        mov EDX, [-4+ ESI + 4*EBX];
+        shld EAX, EDX, CL;
+        mov [EDI+4*EBX], EAX;
+L_odd:
+        mov EAX, [-8+ESI + 4*EBX];
+        shld EDX, EAX, CL;
+        mov [-4+EDI + 4*EBX], EDX;
+        sub EBX, 2;
+        jg L_even;
+L_last:
+        shl EAX, CL;
+        mov [EDI], EAX;
+        pop EAX; // pop return value
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+    }
+}
+
+/** dest[#] = src[#] >> numbits
+ *  numbits must be in the range 1 .. 31
+ * This version uses MMX.
+ */
+uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure
+{
+    // Timing:
+    // K7 1.2/int. PM 1.7/int P4 5.3/int
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm pure nothrow {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+
+        movd MM3, EAX; // numbits = bits to shift left
+        xor EAX, 63;
+        align   16;
+        inc EAX;
+        movd MM4, EAX ; // 64-numbits = bits to shift right
+
+        // Get the return value into EAX
+        and EAX, 31; // EAX = 32-numbits
+        movd MM2, EAX; // 32-numbits
+        movd MM1, [ESI+4*EBX-4];
+        psrlq MM1, MM2;
+        movd EAX, MM1;  // EAX = return value
+        test EBX, 1;
+        jz L_even;
+L_odd:
+        cmp EBX, 1;
+        jz L_length1;
+
+         // deal with odd lengths
+        movq MM1, [ESI+4*EBX-8];
+        psrlq MM1, MM2;
+        movd    [EDI +4*EBX-4], MM1;
+        sub EBX, 1;
+L_even: // It's either singly or doubly even
+        movq    MM2, [ESI + 4*EBX - 8];
+        psllq   MM2, MM3;
+        sub EBX, 2;
+        jle L_last;
+        movq MM1, MM2;
+        add EBX, 2;
+        test EBX, 2;
+        jz L_onceeven;
+        sub EBX, 2;
+
+        // MAIN LOOP -- 128 bytes per iteration
+ L_twiceeven:      // here MM2 is the carry
+        movq    MM0, [ESI + 4*EBX-8];
+        psrlq   MM0, MM4;
+        movq    MM1, [ESI + 4*EBX-8];
+        psllq   MM1, MM3;
+        por     MM2, MM0;
+        movq    [EDI +4*EBX], MM2;
+L_onceeven:        // here MM1 is the carry
+        movq    MM0, [ESI + 4*EBX-16];
+        psrlq   MM0, MM4;
+        movq    MM2, [ESI + 4*EBX-16];
+        por     MM1, MM0;
+        movq    [EDI +4*EBX-8], MM1;
+        psllq   MM2, MM3;
+        sub EBX, 4;
+        jg L_twiceeven;
+L_last:
+        movq    [EDI +4*EBX], MM2;
+L_alldone:
+        emms;  // NOTE: costs 6 cycles on Intel CPUs
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+
+L_length1:
+        // length 1 is a special case
+        movd MM1, [ESI];
+        psllq MM1, MM3;
+        movd [EDI], MM1;
+        jmp L_alldone;
+    }
+}
+
+void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure
+{
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm pure nothrow {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+align 16;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        neg EBX;                // count UP to zero.
+
+        movd MM3, EAX; // numbits = bits to shift right
+        xor EAX, 63;
+        inc EAX;
+        movd MM4, EAX ; // 64-numbits = bits to shift left
+
+        test EBX, 1;
+        jz L_even;
+L_odd:
+         // deal with odd lengths
+        and EAX, 31; // EAX = 32-numbits
+        movd MM2, EAX; // 32-numbits
+        cmp EBX, -1;
+        jz L_length1;
+
+        movq MM0, [ESI+4*EBX];
+        psrlq MM0, MM3;
+        movd    [EDI +4*EBX], MM0;
+        add EBX, 1;
+L_even:
+        movq    MM2, [ESI + 4*EBX];
+        psrlq   MM2, MM3;
+
+        movq MM1, MM2;
+        add EBX, 4;
+        cmp EBX, -2+4;
+        jz L_last;
+        // It's either singly or doubly even
+        sub EBX, 2;
+        test EBX, 2;
+        jnz L_onceeven;
+        add EBX, 2;
+
+        // MAIN LOOP -- 128 bytes per iteration
+ L_twiceeven:      // here MM2 is the carry
+        movq    MM0, [ESI + 4*EBX-8];
+        psllq   MM0, MM4;
+        movq    MM1, [ESI + 4*EBX-8];
+        psrlq   MM1, MM3;
+        por     MM2, MM0;
+        movq    [EDI +4*EBX-16], MM2;
+L_onceeven:        // here MM1 is the carry
+        movq    MM0, [ESI + 4*EBX];
+        psllq   MM0, MM4;
+        movq    MM2, [ESI + 4*EBX];
+        por     MM1, MM0;
+        movq    [EDI +4*EBX-8], MM1;
+        psrlq   MM2, MM3;
+        add EBX, 4;
+        jl L_twiceeven;
+L_last:
+        movq    [EDI +4*EBX-16], MM2;
+L_alldone:
+        emms;  // NOTE: costs 6 cycles on Intel CPUs
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+
+L_length1:
+        // length 1 is a special case
+        movd MM1, [ESI+4*EBX];
+        psrlq MM1, MM3;
+        movd    [EDI +4*EBX], MM1;
+        jmp L_alldone;
+
+    }
+}
+
+/** dest[#] = src[#] >> numbits
+ *  numbits must be in the range 1 .. 31
+ */
+void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure
+{
+    // Timing: Optimal for P6 family.
+    // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
+    // Terrible performance on AMD64, which has 7 cycles for SHRD!!
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm pure nothrow {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+        mov ECX, EAX; // numbits;
+
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        neg EBX;                // count UP to zero.
+        mov EAX, [ESI + 4*EBX];
+        cmp EBX, -1;
+        jz L_last;
+        mov EDX, [ESI + 4*EBX];
+        test EBX, 1;
+        jz L_odd;
+        add EBX, 1;
+L_even:
+        mov EDX, [ ESI + 4*EBX];
+        shrd EAX, EDX, CL;
+        mov [-4 + EDI+4*EBX], EAX;
+L_odd:
+        mov EAX, [4 + ESI + 4*EBX];
+        shrd EDX, EAX, CL;
+        mov [EDI + 4*EBX], EDX;
+        add EBX, 2;
+        jl L_even;
+L_last:
+        shr EAX, CL;
+        mov [-4 + EDI], EAX;
+
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+    }
+}
+
+@system unittest
+{
+
+    uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteShr(aa[0..$-1], aa, 4);
+    assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555
+        && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
+
+    aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteShr(aa[2..$-1], aa[2..$-1], 4);
+    assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
+        && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
+
+    aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteShr(aa[0..$-2], aa, 4);
+    assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
+    assert(aa[0]==0x6122_2222);
+    assert(aa[3]==0xBCCC_CCCD);
+
+
+    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4);
+    assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
+        && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
+    assert(r == 8);
+
+    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4);
+    assert(aa[0] == 0xF0FF_FFFF
+        && aa[2]==0x5555_5561);
+        assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
+    assert(r == 8);
+        assert(aa[1]==0x2222_2230);
+
+    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31);
+}
+
+/** dest[#] = src[#] * multiplier + carry.
+ * Returns carry.
+ */
+uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry)
+    pure
+{
+    // Timing: definitely not optimal.
+    // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
+    // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
+
+    enum { LASTPARAM = 4*4 } // 4* pushes + return address.
+    // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+    // when initializing variables to zero.
+    version (D_PIC)
+    {
+        enum { zero = 0 }
+    }
+    else
+    {
+        __gshared int zero = 0;
+    }
+    asm pure nothrow {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+
+        mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
+        mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
+        align 16;
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        mov ECX, EAX; // [carry]; -- last param is in EAX.
+        neg EBX;                // count UP to zero.
+        test EBX, 1;
+        jnz L_odd;
+        add EBX, 1;
+ L1:
+        mov EAX, [-4 + ESI + 4*EBX];
+        mul int ptr [ESP+LASTPARAM]; //[multiplier];
+        add EAX, ECX;
+        mov ECX, zero;
+        mov [-4+EDI + 4*EBX], EAX;
+        adc ECX, EDX;
+L_odd:
+        mov EAX, [ESI + 4*EBX];  // p2
+        mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3,
+        add EAX, ECX;
+        mov ECX, zero;
+        adc ECX, EDX;
+        mov [EDI + 4*EBX], EAX;
+        add EBX, 2;
+        jl L1;
+
+        mov EAX, ECX; // get final carry
+
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 5*4;
+    }
+}
+
+@system unittest
+{
+    uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
+    assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 &&
+        aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
+}
+
+// The inner multiply-and-add loop, together with the Even entry point.
+// Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
+// This is the most time-critical code in the BigInt library.
+// It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
+string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure {
+    // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
+    // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load
+    // (unit p2) for initializing registers to zero.
+    // There are also dependencies between the instructions, and we run up against the
+    // ROB-read limit (can only read 2 registers per cycle).
+    // We also need the number of uops in the loop to be a multiple of 3.
+    // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
+        // if Position-Independent Code is required.
+
+        // Register usage
+    // ESI = end of src
+    // EDI = end of dest
+    // EBX = index. Counts up to zero (in steps of 2).
+    // EDX:EAX = scratch, used in multiply.
+    // ECX = carry1.
+    // EBP = carry2.
+        // ESP = points to the multiplier.
+
+        // The first member of 'dest' which will be modified is [EDI+4*EBX].
+        // EAX must already contain the first member of 'src', [ESI+4*EBX].
+
+    version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; }
+    return "
+        // Entry point for even length
+        add EBX, 1;
+        mov EBP, ECX; // carry
+
+        mul int ptr [" ~ M_ADDRESS ~ "]; // M
+        mov ECX, 0;
+
+        add EBP, EAX;
+        mov EAX, [ESI+4*EBX];
+        adc ECX, EDX;
+
+        mul int ptr [" ~ M_ADDRESS ~ "]; // M
+        " ~ OP ~ " [-4+EDI+4*EBX], EBP;
+        mov EBP, zero;
+
+        adc ECX, EAX;
+        mov EAX, [4+ESI+4*EBX];
+
+        adc EBP, EDX;
+        add EBX, 2;
+        jnl L_done;
+L1:
+        mul int ptr [" ~ M_ADDRESS ~ "];
+        " ~ OP ~ " [-8+EDI+4*EBX], ECX;
+        adc EBP, EAX;
+        mov ECX, zero;
+        mov EAX, [ESI+4*EBX];
+        adc ECX, EDX;
+" ~
+        (using_PIC ? "" : "   mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
+~ "
+        mul int ptr [" ~ M_ADDRESS ~ "];
+        " ~ OP ~ " [-4+EDI+4*EBX], EBP;
+        mov EBP, zero;
+
+        adc ECX, EAX;
+        mov EAX, [4+ESI+4*EBX];
+
+        adc EBP, EDX;
+        add EBX, 2;
+        jl L1;
+L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
+        adc EBP, 0;
+";
+                // final carry is now in EBP
+}
+
+string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure
+{
+    return "
+        mul int ptr [" ~M_ADDRESS ~"];
+        mov EBP, zero;
+        add ECX, EAX;
+        mov EAX, [4+ESI+4*EBX];
+
+        adc EBP, EDX;
+        add EBX, 2;
+        jl L1;
+        jmp L_done;
+";
+}
+
+
+
+/**
+ * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF).
+ * where op == '+' or '-'
+ * Returns carry out of MSB (0 .. FFFF_FFFF).
+ */
+uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint
+        multiplier, uint carry) pure {
+    // Timing: This is the most time-critical bignum function.
+    // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
+
+    // The main loop is pipelined and unrolled by 2,
+    //   so entry to the loop is also complicated.
+
+    // Register usage
+    // EDX:EAX = multiply
+    // EBX = counter
+    // ECX = carry1
+    // EBP = carry2
+    // EDI = dest
+    // ESI = src
+
+    enum string OP = (op=='+')? "add" : "sub";
+    version (D_PIC)
+    {
+        enum { zero = 0 }
+    }
+    else
+    {
+        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+        // when initializing registers to zero.
+        __gshared int zero = 0;
+        // use p3/p4 units
+        __gshared int storagenop; // write-only
+    }
+
+    enum { LASTPARAM = 5*4 } // 4* pushes + return address.
+    asm pure nothrow {
+        naked;
+
+        push ESI;
+        push EDI;
+        push EBX;
+        push EBP;
+        mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
+        align 16;
+        nop;
+        mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        mov EBP, 0;
+        mov ECX, EAX; // ECX = input carry.
+        neg EBX;                // count UP to zero.
+        mov EAX, [ESI+4*EBX];
+        test EBX, 1;
+        jnz L_enter_odd;
+    }
+    // Main loop, with entry point for even length
+    mixin("asm pure nothrow {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}");
+    asm pure nothrow {
+        mov EAX, EBP; // get final carry
+        pop EBP;
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 5*4;
+    }
+L_enter_odd:
+    mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}");
+}
+
+@system unittest
+{
+
+    uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
+    multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
+    assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
+    assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
+         && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
+}
+
+/**
+   Sets result[#] = result[0 .. left.length] + left[#] * right[#]
+
+   It is defined in this way to allow cache-efficient multiplication.
+   This function is equivalent to:
+    ----
+    for (int i = 0; i< right.length; ++i)
+    {
+        dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
+                left, right[i], 0);
+    }
+    ----
+ */
+void multibyteMultiplyAccumulate(uint [] dest, const uint[] left,
+        const uint [] right) pure {
+    // Register usage
+    // EDX:EAX = used in multiply
+    // EBX = index
+    // ECX = carry1
+    // EBP = carry2
+    // EDI = end of dest for this pass through the loop. Index for outer loop.
+    // ESI = end of left. never changes
+    // [ESP] = M = right[i] = multiplier for this pass through the loop.
+    // right.length is changed into dest.ptr+dest.length
+    version (D_PIC)
+    {
+        enum { zero = 0 }
+    }
+    else
+    {
+        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+        // when initializing registers to zero.
+        __gshared int zero = 0;
+        // use p3/p4 units
+        __gshared int storagenop; // write-only
+    }
+
+    enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
+    asm pure nothrow {
+        naked;
+
+        push ESI;
+        push EDI;
+        align 16;
+        push EBX;
+        push EBP;
+        push EAX;    // local variable M
+        mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
+        mov ESI, [ESP + LASTPARAM + 4*3];  // left.ptr
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
+
+        mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
+        lea EAX, [EDI + 4*EAX];
+        mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI
+
+        lea ESI, [ESI + 4*EBX]; // ESI = end of left
+        mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
+        mov EAX, [EAX];
+        mov [ESP], EAX; // M
+outer_loop:
+        mov EBP, 0;
+        mov ECX, 0; // ECX = input carry.
+        neg EBX;                // count UP to zero.
+        mov EAX, [ESI+4*EBX];
+        test EBX, 1;
+        jnz L_enter_odd;
+    }
+    // -- Inner loop, with even entry point
+    mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
+    asm pure nothrow {
+        mov [-4+EDI+4*EBX], EBP;
+        add EDI, 4;
+        cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
+        jz outer_done;
+        mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
+        mov EAX, [EAX+4];                 // get new M
+        mov [ESP], EAX;                   // save new M
+        add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
+        mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
+        jmp outer_loop;
+outer_done:
+        pop EAX;
+        pop EBP;
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 6*4;
+    }
+L_enter_odd:
+    mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
+}
+
+/**  dest[#] /= divisor.
+ * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
+ * divisor must not be a power of 2 (use right shift for that case;
+ * A division by zero will occur if divisor is a power of 2).
+ * Returns the final remainder
+ *
+ * Based on public domain code by Eric Bainville.
+ * (http://www.bealto.com/) Used with permission.
+ */
+uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure
+{
+    // Timing: limited by a horrible dependency chain.
+    // Pentium M: 18 cycles/op, 8 resource stalls/op.
+    // EAX, EDX = scratch, used by MUL
+    // EDI = dest
+    // CL = shift
+    // ESI = quotient
+    // EBX = remainderhi
+    // EBP = remainderlo
+    // [ESP-4] = mask
+    // [ESP] = kinv (2^64 /divisor)
+    enum { LASTPARAM = 5*4 } // 4* pushes + return address.
+    enum { LOCALS = 2*4} // MASK, KINV
+    asm pure nothrow {
+        naked;
+
+        push ESI;
+        push EDI;
+        push EBX;
+        push EBP;
+
+        mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
+
+        // Loop from msb to lsb
+        lea     EDI, [EDI + 4*EBX];
+        mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1
+        // Build the pseudo-inverse of divisor k: 2^64/k
+        // First determine the shift in ecx to get the max number of bits in kinv
+        xor     ECX, ECX;
+        mov     EAX, [ESP + LASTPARAM]; //divisor;
+        mov     EDX, 1;
+kinv1:
+        inc     ECX;
+        ror     EDX, 1;
+        shl     EAX, 1;
+        jnc     kinv1;
+        dec     ECX;
+        // Here, ecx is a left shift moving the msb of k to bit 32
+
+        mov     EAX, 1;
+        shl     EAX, CL;
+        dec     EAX;
+        ror     EAX, CL ; //ecx bits at msb
+        push    EAX; // MASK
+
+        // Then divide 2^(32+cx) by divisor (edx already ok)
+        xor     EAX, EAX;
+        div     int ptr [ESP + LASTPARAM +  LOCALS-4*1]; //divisor;
+        push    EAX; // kinv
+        align   16;
+L2:
+        // Get 32 bits of quotient approx, multiplying
+        // most significant word of (rem*2^32+input)
+        mov     EAX, [ESP+4]; //MASK;
+        and     EAX, [EDI - 4];
+        or      EAX, EBP;
+        rol     EAX, CL;
+        mov     EBX, EBP;
+        mov     EBP, [EDI - 4];
+        mul     int ptr [ESP]; //KINV;
+
+        shl     EAX, 1;
+        rcl     EDX, 1;
+
+        // Multiply by k and subtract to get remainder
+        // Subtraction must be done on two words
+        mov     EAX, EDX;
+        mov     ESI, EDX; // quot = high word
+        mul     int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
+        sub     EBP, EAX;
+        sbb     EBX, EDX;
+        jz      Lb;  // high word is 0, goto adjust on single word
+
+        // Adjust quotient and remainder on two words
+Ld:     inc     ESI;
+        sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+        sbb     EBX, 0;
+        jnz     Ld;
+
+        // Adjust quotient and remainder on single word
+Lb:     cmp     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+        jc      Lc; // rem in 0 .. divisor-1, OK
+        sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+        inc     ESI;
+        jmp     Lb;
+
+        // Store result
+Lc:
+        mov     [EDI - 4], ESI;
+        lea     EDI, [EDI - 4];
+        dec     int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
+        jnz    L2;
+
+        pop EAX; // discard kinv
+        pop EAX; // discard mask
+
+        mov     EAX, EBP; // return final remainder
+        pop     EBP;
+        pop     EBX;
+        pop     EDI;
+        pop     ESI;
+        ret     3*4;
+    }
+}
+
+@system unittest
+{
+    uint [] aa = new uint[101];
+    for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
+    uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
+    uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
+    for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
+    assert(r == 0x33FF_7461);
+}
+
+// Set dest[2*i .. 2*i+1]+=src[i]*src[i]
+void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure
+{
+    /* Unlike mulAdd, the carry is only 1 bit,
+           since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
+           Note also that on the last iteration, no carry can occur.
+           As for multibyteAdd, we save & restore carry flag through the loop.
+
+           The timing is entirely dictated by the dependency chain. We could
+           improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
+    */
+    enum { LASTPARAM = 4*5 } // 4* pushes + return address.
+    asm pure nothrow {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+            push ECX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+        lea EDI, [EDI + 8*EBX];      // EDI = end of dest
+        lea ESI, [ESI + 4*EBX];      // ESI = end of src
+        neg EBX;                     // count UP to zero.
+        xor ECX, ECX;             // initial carry = 0.
+L1:
+        mov EAX, [ESI + 4*EBX];
+        mul EAX, EAX;
+        shr CL, 1;                 // get carry
+        adc [EDI + 8*EBX], EAX;
+        adc [EDI + 8*EBX + 4], EDX;
+        setc CL;                   // save carry
+        inc EBX;
+        jnz L1;
+
+        pop ECX;
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+    }
+}
+
+@system unittest
+{
+    uint [] aa = new uint[13];
+        uint [] bb = new uint[6];
+    for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
+    for (int i=0; i<bb.length; ++i) bb[i] = i;
+        aa[$-1]= 7;
+    multibyteAddDiagonalSquares(aa[0..$-1], bb);
+        assert(aa[$-1]==7);
+        for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
+}
+
+void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure
+{
+    for (int i = 0; i < x.length-3; ++i)
+    {
+        dest[i+x.length] = multibyteMulAdd!('+')(
+             dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
+    }
+    ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
+    dest[$-5] = cast(uint) c;
+    c >>= 32;
+    c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
+    dest[$-4] = cast(uint) c;
+    c >>= 32;
+length2:
+    c += cast(ulong)(x[$-2]) * x[$-1];
+        dest[$-3] = cast(uint) c;
+        c >>= 32;
+        dest[$-2] = cast(uint) c;
+}
+
+//dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
+// assert(dest.length = src.length*2);
+// assert(src.length >= 3);
+void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure
+{
+    // Register usage
+    // EDX:EAX = used in multiply
+    // EBX = index
+    // ECX = carry1
+    // EBP = carry2
+    // EDI = end of dest for this pass through the loop. Index for outer loop.
+    // ESI = end of src. never changes
+    // [ESP] = M = src[i] = multiplier for this pass through the loop.
+    // dest.length is changed into dest.ptr+dest.length
+    version (D_PIC)
+    {
+        enum { zero = 0 }
+    }
+    else
+    {
+        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+        // when initializing registers to zero.
+        __gshared int zero = 0;
+        // use p3/p4 units
+        __gshared int storagenop; // write-only
+    }
+
+    enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
+    asm pure nothrow {
+        naked;
+
+        push ESI;
+        push EDI;
+        align 16;
+        push EBX;
+        push EBP;
+        push EAX;    // local variable M= src[i]
+        mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
+        mov ESI, [ESP + LASTPARAM + 4*1];  // src.ptr
+
+        lea ESI, [ESI + 4*EBX]; // ESI = end of left
+        add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M
+
+        // local variable [ESP + LASTPARAM + 4*2] = last value for EDI
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
+
+        lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
+        mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI  = &dest[src.length*2 -3]
+
+        cmp EBX, 3;
+        jz length_is_3;
+
+        // We start at src[1], not src[0].
+        dec EBX;
+        mov [ESP + LASTPARAM + 4*0], EBX;
+
+outer_loop:
+        mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
+        mov EBP, 0;
+        mov ECX, 0; // ECX = input carry.
+        dec [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
+        neg EBX;                // count UP to zero.
+
+        mov EAX, [ESI + 4*EBX - 4*1]; // get new M
+        mov [ESP], EAX;                   // save new M
+
+        mov EAX, [ESI+4*EBX];
+        test EBX, 1;
+        jnz L_enter_odd;
+    }
+    // -- Inner loop, with even entry point
+    mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
+    asm pure nothrow {
+        mov [-4+EDI+4*EBX], EBP;
+        add EDI, 4;
+        cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
+        jnz outer_loop;
+length_is_3:
+        mov EAX, [ESI - 4*3];
+        mul EAX, [ESI - 4*2];
+        mov ECX, 0;
+        add [EDI-2*4], EAX;  // ECX:dest[$-5] += x[$-3] * x[$-2]
+        adc ECX, EDX;
+
+        mov EAX, [ESI - 4*3];
+        mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
+        add EAX, ECX;
+        mov ECX, 0;
+        adc EDX, 0;
+        // now EDX: EAX = c + x[$-3] * x[$-1]
+        add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
+        adc ECX, EDX;  //  ECX holds dest[$-3], it acts as carry for the last row
+// do length == 2
+        mov EAX, [ESI - 4*2];
+        mul EAX, [ESI - 4*1];
+        add ECX, EAX;
+        adc EDX, 0;
+        mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
+        mov [EDI + 1*4], EDX;
+
+        pop EAX;
+        pop EBP;
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+    }
+L_enter_odd:
+    mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
+}
+
+@system unittest
+{
+   uint [] aa = new uint[200];
+   uint [] a  = aa[0 .. 100];
+   uint [] b  = new uint [100];
+   aa[] = 761;
+   a[] = 0;
+   b[] = 0;
+   a[3] = 6;
+   b[0]=1;
+   b[1] = 17;
+   b[50 .. 100]=78;
+   multibyteTriangleAccumulateAsm(a, b[0 .. 50]);
+   uint [] c = new uint[100];
+   c[] = 0;
+   c[1] = 17;
+   c[3] = 6;
+   assert(a[]==c[]);
+   assert(a[0]==0);
+   aa[] = 0xFFFF_FFFF;
+   a[] = 0;
+   b[] = 0;
+   b[0]= 0xbf6a1f01;
+   b[1]=  0x6e38ed64;
+   b[2]=  0xdaa797ed;
+   b[3] = 0;
+
+   multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]);
+   assert(a[1]==0x3a600964);
+   assert(a[2]==0x339974f6);
+   assert(a[3]==0x46736fce);
+   assert(a[4]==0x5e24a2b4);
+
+   b[3] = 0xe93ff9f4;
+   b[4] = 0x184f03;
+   a[]=0;
+   multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]);
+   assert(a[3]==0x79fff5c2);
+   assert(a[4]==0xcf384241);
+   assert(a[5]== 0x4a17fc8);
+   assert(a[6]==0x4d549025);
+}
+
+
+void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure
+{
+    if (x.length < 4)
+    {
+        // Special cases, not worth doing triangular.
+        result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0);
+        multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
+        return;
+    }
+    //  Do half a square multiply.
+    //  dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
+    result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0);
+    multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
+    // Multiply by 2
+    result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
+    // And add the diagonal elements
+    result[0] = 0;
+    multibyteAddDiagonalSquares(result, x);
+}
+
+version (BignumPerformanceTest)
+{
+import core.stdc.stdio;
+int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }
+
+__gshared uint [2200] X1;
+__gshared uint [2200] Y1;
+__gshared uint [4000] Z1;
+
+void testPerformance() pure
+{
+    // The performance results at the top of this file were obtained using
+    // a Windows device driver to access the CPU performance counters.
+    // The code below is less accurate but more widely usable.
+    // The value for division is quite inconsistent.
+    for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
+    int t, t0;
+    multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
+    t0 = clock();
+    multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7);
+    t = clock();
+    multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
+    auto shltime = (clock() - t) - (t - t0);
+    t0 = clock();
+    multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13);
+    t = clock();
+    multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13);
+    auto shrtime = (clock() - t) - (t - t0);
+    t0 = clock();
+    multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0);
+    t = clock();
+    multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0);
+    auto addtime = (clock() - t) - (t-t0);
+    t0 = clock();
+    multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0);
+    t = clock();
+    multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0);
+    auto multime = (clock() - t) - (t - t0);
+    multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
+    t0 = clock();
+    multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0);
+    t = clock();
+    multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
+    auto muladdtime = (clock() - t) - (t - t0);
+    multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
+    t = clock();
+    multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
+    auto accumtime = clock() - t;
+    t0 = clock();
+    multibyteDivAssign(Z1[0 .. 2000], 217, 0);
+    t = clock();
+    multibyteDivAssign(Z1[0 .. 1000], 37, 0);
+    auto divtime = (t - t0) - (clock() - t);
+        t= clock();
+    multibyteSquare(Z1[0 .. 64], X1[0 .. 32]);
+    auto squaretime = clock() - t;
+
+    printf("-- BigInt asm performance (cycles/int) --\n");
+    printf("Add:        %.2f\n", addtime/1000.0);
+    printf("Shl:        %.2f\n", shltime/1000.0);
+    printf("Shr:        %.2f\n", shrtime/1000.0);
+    printf("Mul:        %.2f\n", multime/1000.0);
+    printf("MulAdd:     %.2f\n", muladdtime/1000.0);
+    printf("Div:        %.2f\n", divtime/1000.0);
+    printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
+    printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
+}
+
+static this()
+{
+    testPerformance();
+}
+}
+
+} // version (D_InlineAsm_X86)