1 files changed, 3467 insertions, 0 deletions
diff --git a/libphobos/src/std/numeric.d b/libphobos/src/std/numeric.d
new file mode 100644
index 0000000..307406e
--- /dev/null
+++ b/libphobos/src/std/numeric.d
@@ -0,0 +1,3467 @@
+// Written in the D programming language.
+
+/**
+This module is a port of a growing fragment of the $(D_PARAM numeric)
+header in Alexander Stepanov's $(LINK2 http://sgi.com/tech/stl,
+Standard Template Library), with a few additions.
+
+Macros:
+Copyright: Copyright Andrei Alexandrescu 2008 - 2009.
+License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
+Authors:   $(HTTP erdani.org, Andrei Alexandrescu),
+                   Don Clugston, Robert Jacques, Ilya Yaroshenko
+Source:    $(PHOBOSSRC std/_numeric.d)
+*/
+/*
+         Copyright Andrei Alexandrescu 2008 - 2009.
+Distributed under the Boost Software License, Version 1.0.
+   (See accompanying file LICENSE_1_0.txt or copy at
+         http://www.boost.org/LICENSE_1_0.txt)
+*/
+module std.numeric;
+
+import std.complex;
+import std.math;
+import std.range.primitives;
+import std.traits;
+import std.typecons;
+
+version (unittest)
+{
+    import std.stdio;
+}
+/// Format flags for CustomFloat.
+public enum CustomFloatFlags
+{
+    /// Adds a sign bit to allow for signed numbers.
+    signed = 1,
+
+    /**
+     * Store values in normalized form by default. The actual precision of the
+     * significand is extended by 1 bit by assuming an implicit leading bit of 1
+     * instead of 0. i.e. $(D 1.nnnn) instead of $(D 0.nnnn).
+     * True for all $(LINK2 https://en.wikipedia.org/wiki/IEEE_floating_point, IEE754) types
+     */
+    storeNormalized = 2,
+
+    /**
+     * Stores the significand in $(LINK2 https://en.wikipedia.org/wiki/IEEE_754-1985#Denormalized_numbers,
+     * IEEE754 denormalized) form when the exponent is 0. Required to express the value 0.
+     */
+    allowDenorm = 4,
+
+    /**
+      * Allows the storage of $(LINK2 https://en.wikipedia.org/wiki/IEEE_754-1985#Positive_and_negative_infinity,
+      * IEEE754 _infinity) values.
+      */
+    infinity = 8,
+
+    /// Allows the storage of $(LINK2 https://en.wikipedia.org/wiki/NaN, IEEE754 Not a Number) values.
+    nan = 16,
+
+    /**
+     * If set, select an exponent bias such that max_exp = 1.
+     * i.e. so that the maximum value is >= 1.0 and < 2.0.
+     * Ignored if the exponent bias is manually specified.
+     */
+    probability = 32,
+
+    /// If set, unsigned custom floats are assumed to be negative.
+    negativeUnsigned = 64,
+
+    /**If set, 0 is the only allowed $(LINK2 https://en.wikipedia.org/wiki/IEEE_754-1985#Denormalized_numbers,
+     * IEEE754 denormalized) number.
+     * Requires allowDenorm and storeNormalized.
+     */
+    allowDenormZeroOnly = 128 | allowDenorm | storeNormalized,
+
+    /// Include _all of the $(LINK2 https://en.wikipedia.org/wiki/IEEE_floating_point, IEEE754) options.
+    ieee = signed | storeNormalized | allowDenorm | infinity | nan ,
+
+    /// Include none of the above options.
+    none = 0
+}
+
+private template CustomFloatParams(uint bits)
+{
+    enum CustomFloatFlags flags = CustomFloatFlags.ieee
+                ^ ((bits == 80) ? CustomFloatFlags.storeNormalized : CustomFloatFlags.none);
+    static if (bits ==  8) alias CustomFloatParams = CustomFloatParams!( 4,  3, flags);
+    static if (bits == 16) alias CustomFloatParams = CustomFloatParams!(10,  5, flags);
+    static if (bits == 32) alias CustomFloatParams = CustomFloatParams!(23,  8, flags);
+    static if (bits == 64) alias CustomFloatParams = CustomFloatParams!(52, 11, flags);
+    static if (bits == 80) alias CustomFloatParams = CustomFloatParams!(64, 15, flags);
+}
+
+private template CustomFloatParams(uint precision, uint exponentWidth, CustomFloatFlags flags)
+{
+    import std.meta : AliasSeq;
+    alias CustomFloatParams =
+        AliasSeq!(
+            precision,
+            exponentWidth,
+            flags,
+            (1 << (exponentWidth - ((flags & flags.probability) == 0)))
+             - ((flags & (flags.nan | flags.infinity)) != 0) - ((flags & flags.probability) != 0)
+        ); // ((flags & CustomFloatFlags.probability) == 0)
+}
+
+/**
+ * Allows user code to define custom floating-point formats. These formats are
+ * for storage only; all operations on them are performed by first implicitly
+ * extracting them to $(D real) first. After the operation is completed the
+ * result can be stored in a custom floating-point value via assignment.
+ */
+template CustomFloat(uint bits)
+if (bits == 8 || bits == 16 || bits == 32 || bits == 64 || bits == 80)
+{
+    alias CustomFloat = CustomFloat!(CustomFloatParams!(bits));
+}
+
+/// ditto
+template CustomFloat(uint precision, uint exponentWidth, CustomFloatFlags flags = CustomFloatFlags.ieee)
+if (((flags & flags.signed) + precision + exponentWidth) % 8 == 0 && precision + exponentWidth > 0)
+{
+    alias CustomFloat = CustomFloat!(CustomFloatParams!(precision, exponentWidth, flags));
+}
+
+///
+@safe unittest
+{
+    import std.math : sin, cos;
+
+    // Define a 16-bit floating point values
+    CustomFloat!16                                x;     // Using the number of bits
+    CustomFloat!(10, 5)                           y;     // Using the precision and exponent width
+    CustomFloat!(10, 5,CustomFloatFlags.ieee)     z;     // Using the precision, exponent width and format flags
+    CustomFloat!(10, 5,CustomFloatFlags.ieee, 15) w;     // Using the precision, exponent width, format flags and exponent offset bias
+
+    // Use the 16-bit floats mostly like normal numbers
+    w = x*y - 1;
+
+    // Functions calls require conversion
+    z = sin(+x)           + cos(+y);                     // Use unary plus to concisely convert to a real
+    z = sin(x.get!float)  + cos(y.get!float);            // Or use get!T
+    z = sin(cast(float) x) + cos(cast(float) y);           // Or use cast(T) to explicitly convert
+
+    // Define a 8-bit custom float for storing probabilities
+    alias Probability = CustomFloat!(4, 4, CustomFloatFlags.ieee^CustomFloatFlags.probability^CustomFloatFlags.signed );
+    auto p = Probability(0.5);
+}
+
+/// ditto
+struct CustomFloat(uint             precision,  // fraction bits (23 for float)
+                   uint             exponentWidth,  // exponent bits (8 for float)  Exponent width
+                   CustomFloatFlags flags,
+                   uint             bias)
+if (((flags & flags.signed)  + precision + exponentWidth) % 8 == 0 &&
+    precision + exponentWidth > 0)
+{
+    import std.bitmanip : bitfields;
+    import std.meta : staticIndexOf;
+private:
+    // get the correct unsigned bitfield type to support > 32 bits
+    template uType(uint bits)
+    {
+        static if (bits <= size_t.sizeof*8)  alias uType = size_t;
+        else                                alias uType = ulong ;
+    }
+
+    // get the correct signed   bitfield type to support > 32 bits
+    template sType(uint bits)
+    {
+        static if (bits <= ptrdiff_t.sizeof*8-1) alias sType = ptrdiff_t;
+        else                                    alias sType = long;
+    }
+
+    alias T_sig = uType!precision;
+    alias T_exp = uType!exponentWidth;
+    alias T_signed_exp = sType!exponentWidth;
+
+    alias Flags = CustomFloatFlags;
+
+    // Facilitate converting numeric types to custom float
+    union ToBinary(F)
+        if (is(typeof(CustomFloatParams!(F.sizeof*8))) || is(F == real))
+    {
+        F set;
+
+        // If on Linux or Mac, where 80-bit reals are padded, ignore the
+        // padding.
+        import std.algorithm.comparison : min;
+        CustomFloat!(CustomFloatParams!(min(F.sizeof*8, 80))) get;
+
+        // Convert F to the correct binary type.
+        static typeof(get) opCall(F value)
+        {
+            ToBinary r;
+            r.set = value;
+            return r.get;
+        }
+        alias get this;
+    }
+
+    // Perform IEEE rounding with round to nearest detection
+    void roundedShift(T,U)(ref T sig, U shift)
+    {
+        if (sig << (T.sizeof*8 - shift) == cast(T) 1uL << (T.sizeof*8 - 1))
+        {
+            // round to even
+            sig >>= shift;
+            sig  += sig & 1;
+        }
+        else
+        {
+            sig >>= shift - 1;
+            sig  += sig & 1;
+            // Perform standard rounding
+            sig >>= 1;
+        }
+    }
+
+    // Convert the current value to signed exponent, normalized form
+    void toNormalized(T,U)(ref T sig, ref U exp)
+    {
+        sig = significand;
+        auto shift = (T.sizeof*8) - precision;
+        exp = exponent;
+        static if (flags&(Flags.infinity|Flags.nan))
+        {
+            // Handle inf or nan
+            if (exp == exponent_max)
+            {
+                exp = exp.max;
+                sig <<= shift;
+                static if (flags&Flags.storeNormalized)
+                {
+                    // Save inf/nan in denormalized format
+                    sig >>= 1;
+                    sig  += cast(T) 1uL << (T.sizeof*8 - 1);
+                }
+                return;
+            }
+        }
+        if ((~flags&Flags.storeNormalized) ||
+            // Convert denormalized form to normalized form
+            ((flags&Flags.allowDenorm) && exp == 0))
+        {
+            if (sig > 0)
+            {
+                import core.bitop : bsr;
+                auto shift2 = precision - bsr(sig);
+                exp  -= shift2-1;
+                shift += shift2;
+            }
+            else                                // value = 0.0
+            {
+                exp = exp.min;
+                return;
+            }
+        }
+        sig <<= shift;
+        exp -= bias;
+    }
+
+    // Set the current value from signed exponent, normalized form
+    void fromNormalized(T,U)(ref T sig, ref U exp)
+    {
+        auto shift = (T.sizeof*8) - precision;
+        if (exp == exp.max)
+        {
+            // infinity or nan
+            exp = exponent_max;
+            static if (flags & Flags.storeNormalized)
+                sig <<= 1;
+
+            // convert back to normalized form
+            static if (~flags & Flags.infinity)
+                // No infinity support?
+                assert(sig != 0, "Infinity floating point value assigned to a "
+                        ~ typeof(this).stringof ~ " (no infinity support).");
+
+            static if (~flags & Flags.nan)  // No NaN support?
+                assert(sig == 0, "NaN floating point value assigned to a " ~
+                        typeof(this).stringof ~ " (no nan support).");
+            sig >>= shift;
+            return;
+        }
+        if (exp == exp.min)     // 0.0
+        {
+             exp = 0;
+             sig = 0;
+             return;
+        }
+
+        exp += bias;
+        if (exp <= 0)
+        {
+            static if ((flags&Flags.allowDenorm) ||
+                       // Convert from normalized form to denormalized
+                       (~flags&Flags.storeNormalized))
+            {
+                shift += -exp;
+                roundedShift(sig,1);
+                sig   += cast(T) 1uL << (T.sizeof*8 - 1);
+                // Add the leading 1
+                exp    = 0;
+            }
+            else
+                assert((flags&Flags.storeNormalized) && exp == 0,
+                    "Underflow occured assigning to a " ~
+                    typeof(this).stringof ~ " (no denormal support).");
+        }
+        else
+        {
+            static if (~flags&Flags.storeNormalized)
+            {
+                // Convert from normalized form to denormalized
+                roundedShift(sig,1);
+                sig  += cast(T) 1uL << (T.sizeof*8 - 1);
+                // Add the leading 1
+            }
+        }
+
+        if (shift > 0)
+            roundedShift(sig,shift);
+        if (sig > significand_max)
+        {
+            // handle significand overflow (should only be 1 bit)
+            static if (~flags&Flags.storeNormalized)
+            {
+                sig >>= 1;
+            }
+            else
+                sig &= significand_max;
+            exp++;
+        }
+        static if ((flags&Flags.allowDenormZeroOnly)==Flags.allowDenormZeroOnly)
+        {
+            // disallow non-zero denormals
+            if (exp == 0)
+            {
+                sig <<= 1;
+                if (sig > significand_max && (sig&significand_max) > 0)
+                    // Check and round to even
+                    exp++;
+                sig = 0;
+            }
+        }
+
+        if (exp >= exponent_max)
+        {
+            static if (flags&(Flags.infinity|Flags.nan))
+            {
+                sig         = 0;
+                exp         = exponent_max;
+                static if (~flags&(Flags.infinity))
+                    assert(0, "Overflow occured assigning to a " ~
+                        typeof(this).stringof ~ " (no infinity support).");
+            }
+            else
+                assert(exp == exponent_max, "Overflow occured assigning to a "
+                    ~ typeof(this).stringof ~ " (no infinity support).");
+        }
+    }
+
+public:
+    static if (precision == 64) // CustomFloat!80 support hack
+    {
+        ulong significand;
+        enum ulong significand_max = ulong.max;
+        mixin(bitfields!(
+            T_exp , "exponent", exponentWidth,
+            bool  , "sign"    , flags & flags.signed ));
+    }
+    else
+    {
+        mixin(bitfields!(
+            T_sig, "significand", precision,
+            T_exp, "exponent"   , exponentWidth,
+            bool , "sign"       , flags & flags.signed ));
+    }
+
+    /// Returns: infinity value
+    static if (flags & Flags.infinity)
+        static @property CustomFloat infinity()
+        {
+            CustomFloat value;
+            static if (flags & Flags.signed)
+            value.sign          = 0;
+            value.significand   = 0;
+            value.exponent      = exponent_max;
+            return value;
+        }
+
+    /// Returns: NaN value
+    static if (flags & Flags.nan)
+        static @property CustomFloat nan()
+        {
+            CustomFloat value;
+            static if (flags & Flags.signed)
+            value.sign          = 0;
+            value.significand   = cast(typeof(significand_max)) 1L << (precision-1);
+            value.exponent      = exponent_max;
+            return value;
+        }
+
+    /// Returns: number of decimal digits of precision
+    static @property size_t dig()
+    {
+        auto shiftcnt =  precision - ((flags&Flags.storeNormalized) != 0);
+        immutable x = (shiftcnt == 64) ? 0 : 1uL << shiftcnt;
+        return cast(size_t) log10(x);
+    }
+
+    /// Returns: smallest increment to the value 1
+    static @property CustomFloat epsilon()
+    {
+        CustomFloat value;
+        static if (flags & Flags.signed)
+        value.sign       = 0;
+        T_signed_exp exp = -precision;
+        T_sig        sig = 0;
+
+        value.fromNormalized(sig,exp);
+        if (exp == 0 && sig == 0) // underflowed to zero
+        {
+            static if ((flags&Flags.allowDenorm) ||
+                       (~flags&Flags.storeNormalized))
+                sig = 1;
+            else
+                sig = cast(T) 1uL << (precision - 1);
+        }
+        value.exponent     = cast(value.T_exp) exp;
+        value.significand  = cast(value.T_sig) sig;
+        return value;
+    }
+
+    /// the number of bits in mantissa
+    enum mant_dig = precision + ((flags&Flags.storeNormalized) != 0);
+
+    /// Returns: maximum int value such that 10<sup>max_10_exp</sup> is representable
+    static @property int max_10_exp(){ return cast(int) log10( +max ); }
+
+    /// maximum int value such that 2<sup>max_exp-1</sup> is representable
+    enum max_exp = exponent_max-bias+((~flags&(Flags.infinity|flags.nan))!=0);
+
+    /// Returns: minimum int value such that 10<sup>min_10_exp</sup> is representable
+    static @property int min_10_exp(){ return cast(int) log10( +min_normal ); }
+
+    /// minimum int value such that 2<sup>min_exp-1</sup> is representable as a normalized value
+    enum min_exp = cast(T_signed_exp)-bias +1+ ((flags&Flags.allowDenorm)!=0);
+
+    /// Returns: largest representable value that's not infinity
+    static @property CustomFloat max()
+    {
+        CustomFloat value;
+        static if (flags & Flags.signed)
+        value.sign        = 0;
+        value.exponent    = exponent_max - ((flags&(flags.infinity|flags.nan)) != 0);
+        value.significand = significand_max;
+        return value;
+    }
+
+    /// Returns: smallest representable normalized value that's not 0
+    static @property CustomFloat min_normal() {
+        CustomFloat value;
+        static if (flags & Flags.signed)
+        value.sign        = 0;
+        value.exponent    = 1;
+        static if (flags&Flags.storeNormalized)
+            value.significand = 0;
+        else
+            value.significand = cast(T_sig) 1uL << (precision - 1);
+        return value;
+    }
+
+    /// Returns: real part
+    @property CustomFloat re() { return this; }
+
+    /// Returns: imaginary part
+    static @property CustomFloat im() { return CustomFloat(0.0f); }
+
+    /// Initialize from any $(D real) compatible type.
+    this(F)(F input) if (__traits(compiles, cast(real) input ))
+    {
+        this = input;
+    }
+
+    /// Self assignment
+    void opAssign(F:CustomFloat)(F input)
+    {
+        static if (flags & Flags.signed)
+        sign        = input.sign;
+        exponent    = input.exponent;
+        significand = input.significand;
+    }
+
+    /// Assigns from any $(D real) compatible type.
+    void opAssign(F)(F input)
+        if (__traits(compiles, cast(real) input))
+    {
+        import std.conv : text;
+
+        static if (staticIndexOf!(Unqual!F, float, double, real) >= 0)
+            auto value = ToBinary!(Unqual!F)(input);
+        else
+            auto value = ToBinary!(real    )(input);
+
+        // Assign the sign bit
+        static if (~flags & Flags.signed)
+            assert((!value.sign) ^ ((flags&flags.negativeUnsigned) > 0),
+                "Incorrectly signed floating point value assigned to a " ~
+                typeof(this).stringof ~ " (no sign support).");
+        else
+            sign = value.sign;
+
+        CommonType!(T_signed_exp ,value.T_signed_exp) exp = value.exponent;
+        CommonType!(T_sig,        value.T_sig       ) sig = value.significand;
+
+        value.toNormalized(sig,exp);
+        fromNormalized(sig,exp);
+
+        assert(exp <= exponent_max,    text(typeof(this).stringof ~
+            " exponent too large: "   ,exp," > ",exponent_max,   "\t",input,"\t",sig));
+        assert(sig <= significand_max, text(typeof(this).stringof ~
+            " significand too large: ",sig," > ",significand_max,
+            "\t",input,"\t",exp," ",exponent_max));
+        exponent    = cast(T_exp) exp;
+        significand = cast(T_sig) sig;
+    }
+
+    /// Fetches the stored value either as a $(D float), $(D double) or $(D real).
+    @property F get(F)()
+        if (staticIndexOf!(Unqual!F, float, double, real) >= 0)
+    {
+        import std.conv : text;
+
+        ToBinary!F result;
+
+        static if (flags&Flags.signed)
+            result.sign = sign;
+        else
+            result.sign = (flags&flags.negativeUnsigned) > 0;
+
+        CommonType!(T_signed_exp ,result.get.T_signed_exp ) exp = exponent; // Assign the exponent and fraction
+        CommonType!(T_sig,        result.get.T_sig        ) sig = significand;
+
+        toNormalized(sig,exp);
+        result.fromNormalized(sig,exp);
+        assert(exp <= result.exponent_max,    text("get exponent too large: "   ,exp," > ",result.exponent_max) );
+        assert(sig <= result.significand_max, text("get significand too large: ",sig," > ",result.significand_max) );
+        result.exponent     = cast(result.get.T_exp) exp;
+        result.significand  = cast(result.get.T_sig) sig;
+        return result.set;
+    }
+
+    ///ditto
+    T opCast(T)() if (__traits(compiles, get!T )) { return get!T; }
+
+    /// Convert the CustomFloat to a real and perform the relavent operator on the result
+    real opUnary(string op)()
+        if (__traits(compiles, mixin(op~`(get!real)`)) || op=="++" || op=="--")
+    {
+        static if (op=="++" || op=="--")
+        {
+            auto result = get!real;
+            this = mixin(op~`result`);
+            return result;
+        }
+        else
+            return mixin(op~`get!real`);
+    }
+
+    /// ditto
+    real opBinary(string op,T)(T b)
+        if (__traits(compiles, mixin(`get!real`~op~`b`)))
+    {
+        return mixin(`get!real`~op~`b`);
+    }
+
+    /// ditto
+    real opBinaryRight(string op,T)(T a)
+        if ( __traits(compiles, mixin(`a`~op~`get!real`)) &&
+            !__traits(compiles, mixin(`get!real`~op~`b`)))
+    {
+        return mixin(`a`~op~`get!real`);
+    }
+
+    /// ditto
+    int opCmp(T)(auto ref T b)
+        if (__traits(compiles, cast(real) b))
+    {
+        auto x = get!real;
+        auto y = cast(real) b;
+        return  (x >= y)-(x <= y);
+    }
+
+    /// ditto
+    void opOpAssign(string op, T)(auto ref T b)
+        if (__traits(compiles, mixin(`get!real`~op~`cast(real) b`)))
+    {
+        return mixin(`this = this `~op~` cast(real) b`);
+    }
+
+    /// ditto
+    template toString()
+    {
+        import std.format : FormatSpec, formatValue;
+        // Needs to be a template because of DMD @@BUG@@ 13737.
+        void toString()(scope void delegate(const(char)[]) sink, FormatSpec!char fmt)
+        {
+            sink.formatValue(get!real, fmt);
+        }
+    }
+}
+
+@safe unittest
+{
+    import std.meta;
+    alias FPTypes =
+        AliasSeq!(
+            CustomFloat!(5, 10),
+            CustomFloat!(5, 11, CustomFloatFlags.ieee ^ CustomFloatFlags.signed),
+            CustomFloat!(1, 15, CustomFloatFlags.ieee ^ CustomFloatFlags.signed),
+            CustomFloat!(4, 3, CustomFloatFlags.ieee | CustomFloatFlags.probability ^ CustomFloatFlags.signed)
+        );
+
+    foreach (F; FPTypes)
+    {
+        auto x = F(0.125);
+        assert(x.get!float == 0.125F);
+        assert(x.get!double == 0.125);
+
+        x -= 0.0625;
+        assert(x.get!float == 0.0625F);
+        assert(x.get!double == 0.0625);
+
+        x *= 2;
+        assert(x.get!float == 0.125F);
+        assert(x.get!double == 0.125);
+
+        x /= 4;
+        assert(x.get!float == 0.03125);
+        assert(x.get!double == 0.03125);
+
+        x = 0.5;
+        x ^^= 4;
+        assert(x.get!float == 1 / 16.0F);
+        assert(x.get!double == 1 / 16.0);
+    }
+}
+
+@system unittest
+{
+    // @system due to to!string(CustomFloat)
+    import std.conv;
+    CustomFloat!(5, 10) y = CustomFloat!(5, 10)(0.125);
+    assert(y.to!string == "0.125");
+}
+
+/**
+Defines the fastest type to use when storing temporaries of a
+calculation intended to ultimately yield a result of type $(D F)
+(where $(D F) must be one of $(D float), $(D double), or $(D
+real)). When doing a multi-step computation, you may want to store
+intermediate results as $(D FPTemporary!F).
+
+The necessity of $(D FPTemporary) stems from the optimized
+floating-point operations and registers present in virtually all
+processors. When adding numbers in the example above, the addition may
+in fact be done in $(D real) precision internally. In that case,
+storing the intermediate $(D result) in $(D double format) is not only
+less precise, it is also (surprisingly) slower, because a conversion
+from $(D real) to $(D double) is performed every pass through the
+loop. This being a lose-lose situation, $(D FPTemporary!F) has been
+defined as the $(I fastest) type to use for calculations at precision
+$(D F). There is no need to define a type for the $(I most accurate)
+calculations, as that is always $(D real).
+
+Finally, there is no guarantee that using $(D FPTemporary!F) will
+always be fastest, as the speed of floating-point calculations depends
+on very many factors.
+ */
+template FPTemporary(F)
+if (isFloatingPoint!F)
+{
+    version (X86)
+        alias FPTemporary = real;
+    else
+        alias FPTemporary = Unqual!F;
+}
+
+///
+@safe unittest
+{
+    import std.math : approxEqual;
+
+    // Average numbers in an array
+    double avg(in double[] a)
+    {
+        if (a.length == 0) return 0;
+        FPTemporary!double result = 0;
+        foreach (e; a) result += e;
+        return result / a.length;
+    }
+
+    auto a = [1.0, 2.0, 3.0];
+    assert(approxEqual(avg(a), 2));
+}
+
+/**
+Implements the $(HTTP tinyurl.com/2zb9yr, secant method) for finding a
+root of the function $(D fun) starting from points $(D [xn_1, x_n])
+(ideally close to the root). $(D Num) may be $(D float), $(D double),
+or $(D real).
+*/
+template secantMethod(alias fun)
+{
+    import std.functional : unaryFun;
+    Num secantMethod(Num)(Num xn_1, Num xn)
+    {
+        auto fxn = unaryFun!(fun)(xn_1), d = xn_1 - xn;
+        typeof(fxn) fxn_1;
+
+        xn = xn_1;
+        while (!approxEqual(d, 0) && isFinite(d))
+        {
+            xn_1 = xn;
+            xn -= d;
+            fxn_1 = fxn;
+            fxn = unaryFun!(fun)(xn);
+            d *= -fxn / (fxn - fxn_1);
+        }
+        return xn;
+    }
+}
+
+///
+@safe unittest
+{
+    import std.math : approxEqual, cos;
+
+    float f(float x)
+    {
+        return cos(x) - x*x*x;
+    }
+    auto x = secantMethod!(f)(0f, 1f);
+    assert(approxEqual(x, 0.865474));
+}
+
+@system unittest
+{
+    // @system because of __gshared stderr
+    scope(failure) stderr.writeln("Failure testing secantMethod");
+    float f(float x)
+    {
+        return cos(x) - x*x*x;
+    }
+    immutable x = secantMethod!(f)(0f, 1f);
+    assert(approxEqual(x, 0.865474));
+    auto d = &f;
+    immutable y = secantMethod!(d)(0f, 1f);
+    assert(approxEqual(y, 0.865474));
+}
+
+
+/**
+ * Return true if a and b have opposite sign.
+ */
+private bool oppositeSigns(T1, T2)(T1 a, T2 b)
+{
+    return signbit(a) != signbit(b);
+}
+
+public:
+
+/**  Find a real root of a real function f(x) via bracketing.
+ *
+ * Given a function `f` and a range `[a .. b]` such that `f(a)`
+ * and `f(b)` have opposite signs or at least one of them equals ±0,
+ * returns the value of `x` in
+ * the range which is closest to a root of `f(x)`.  If `f(x)`
+ * has more than one root in the range, one will be chosen
+ * arbitrarily.  If `f(x)` returns NaN, NaN will be returned;
+ * otherwise, this algorithm is guaranteed to succeed.
+ *
+ * Uses an algorithm based on TOMS748, which uses inverse cubic
+ * interpolation whenever possible, otherwise reverting to parabolic
+ * or secant interpolation. Compared to TOMS748, this implementation
+ * improves worst-case performance by a factor of more than 100, and
+ * typical performance by a factor of 2. For 80-bit reals, most
+ * problems require 8 to 15 calls to `f(x)` to achieve full machine
+ * precision. The worst-case performance (pathological cases) is
+ * approximately twice the number of bits.
+ *
+ * References: "On Enclosing Simple Roots of Nonlinear Equations",
+ * G. Alefeld, F.A. Potra, Yixun Shi, Mathematics of Computation 61,
+ * pp733-744 (1993).  Fortran code available from $(HTTP
+ * www.netlib.org,www.netlib.org) as algorithm TOMS478.
+ *
+ */
+T findRoot(T, DF, DT)(scope DF f, in T a, in T b,
+    scope DT tolerance) //= (T a, T b) => false)
+if (
+    isFloatingPoint!T &&
+    is(typeof(tolerance(T.init, T.init)) : bool) &&
+    is(typeof(f(T.init)) == R, R) && isFloatingPoint!R
+    )
+{
+    immutable fa = f(a);
+    if (fa == 0)
+        return a;
+    immutable fb = f(b);
+    if (fb == 0)
+        return b;
+    immutable r = findRoot(f, a, b, fa, fb, tolerance);
+    // Return the first value if it is smaller or NaN
+    return !(fabs(r[2]) > fabs(r[3])) ? r[0] : r[1];
+}
+
+///ditto
+T findRoot(T, DF)(scope DF f, in T a, in T b)
+{
+    return findRoot(f, a, b, (T a, T b) => false);
+}
+
+/** Find root of a real function f(x) by bracketing, allowing the
+ * termination condition to be specified.
+ *
+ * Params:
+ *
+ * f = Function to be analyzed
+ *
+ * ax = Left bound of initial range of `f` known to contain the
+ * root.
+ *
+ * bx = Right bound of initial range of `f` known to contain the
+ * root.
+ *
+ * fax = Value of $(D f(ax)).
+ *
+ * fbx = Value of $(D f(bx)). $(D fax) and $(D fbx) should have opposite signs.
+ * ($(D f(ax)) and $(D f(bx)) are commonly known in advance.)
+ *
+ *
+ * tolerance = Defines an early termination condition. Receives the
+ *             current upper and lower bounds on the root. The
+ *             delegate must return $(D true) when these bounds are
+ *             acceptable. If this function always returns $(D false),
+ *             full machine precision will be achieved.
+ *
+ * Returns:
+ *
+ * A tuple consisting of two ranges. The first two elements are the
+ * range (in `x`) of the root, while the second pair of elements
+ * are the corresponding function values at those points. If an exact
+ * root was found, both of the first two elements will contain the
+ * root, and the second pair of elements will be 0.
+ */
+Tuple!(T, T, R, R) findRoot(T, R, DF, DT)(scope DF f, in T ax, in T bx, in R fax, in R fbx,
+    scope DT tolerance) // = (T a, T b) => false)
+if (
+    isFloatingPoint!T &&
+    is(typeof(tolerance(T.init, T.init)) : bool) &&
+    is(typeof(f(T.init)) == R) && isFloatingPoint!R
+    )
+in
+{
+    assert(!ax.isNaN() && !bx.isNaN(), "Limits must not be NaN");
+    assert(signbit(fax) != signbit(fbx), "Parameters must bracket the root.");
+}
+body
+{
+    // Author: Don Clugston. This code is (heavily) modified from TOMS748
+    // (www.netlib.org).  The changes to improve the worst-cast performance are
+    // entirely original.
+
+    T a, b, d;  // [a .. b] is our current bracket. d is the third best guess.
+    R fa, fb, fd; // Values of f at a, b, d.
+    bool done = false; // Has a root been found?
+
+    // Allow ax and bx to be provided in reverse order
+    if (ax <= bx)
+    {
+        a = ax; fa = fax;
+        b = bx; fb = fbx;
+    }
+    else
+    {
+        a = bx; fa = fbx;
+        b = ax; fb = fax;
+    }
+
+    // Test the function at point c; update brackets accordingly
+    void bracket(T c)
+    {
+        R fc = f(c);
+        if (fc == 0 || fc.isNaN()) // Exact solution, or NaN
+        {
+            a = c;
+            fa = fc;
+            d = c;
+            fd = fc;
+            done = true;
+            return;
+        }
+
+        // Determine new enclosing interval
+        if (signbit(fa) != signbit(fc))
+        {
+            d = b;
+            fd = fb;
+            b = c;
+            fb = fc;
+        }
+        else
+        {
+            d = a;
+            fd = fa;
+            a = c;
+            fa = fc;
+        }
+    }
+
+   /* Perform a secant interpolation. If the result would lie on a or b, or if
+     a and b differ so wildly in magnitude that the result would be meaningless,
+     perform a bisection instead.
+    */
+    static T secant_interpolate(T a, T b, R fa, R fb)
+    {
+        if (( ((a - b) == a) && b != 0) || (a != 0 && ((b - a) == b)))
+        {
+            // Catastrophic cancellation
+            if (a == 0)
+                a = copysign(T(0), b);
+            else if (b == 0)
+                b = copysign(T(0), a);
+            else if (signbit(a) != signbit(b))
+                return 0;
+            T c = ieeeMean(a, b);
+            return c;
+        }
+        // avoid overflow
+        if (b - a > T.max)
+            return b / 2 + a / 2;
+        if (fb - fa > R.max)
+            return a - (b - a) / 2;
+        T c = a - (fa / (fb - fa)) * (b - a);
+        if (c == a || c == b)
+            return (a + b) / 2;
+        return c;
+    }
+
+    /* Uses 'numsteps' newton steps to approximate the zero in [a .. b] of the
+       quadratic polynomial interpolating f(x) at a, b, and d.
+       Returns:
+         The approximate zero in [a .. b] of the quadratic polynomial.
+    */
+    T newtonQuadratic(int numsteps)
+    {
+        // Find the coefficients of the quadratic polynomial.
+        immutable T a0 = fa;
+        immutable T a1 = (fb - fa)/(b - a);
+        immutable T a2 = ((fd - fb)/(d - b) - a1)/(d - a);
+
+        // Determine the starting point of newton steps.
+        T c = oppositeSigns(a2, fa) ? a  : b;
+
+        // start the safeguarded newton steps.
+        foreach (int i; 0 .. numsteps)
+        {
+            immutable T pc = a0 + (a1 + a2 * (c - b))*(c - a);
+            immutable T pdc = a1 + a2*((2 * c) - (a + b));
+            if (pdc == 0)
+                return a - a0 / a1;
+            else
+                c = c - pc / pdc;
+        }
+        return c;
+    }
+
+    // On the first iteration we take a secant step:
+    if (fa == 0 || fa.isNaN())
+    {
+        done = true;
+        b = a;
+        fb = fa;
+    }
+    else if (fb == 0 || fb.isNaN())
+    {
+        done = true;
+        a = b;
+        fa = fb;
+    }
+    else
+    {
+        bracket(secant_interpolate(a, b, fa, fb));
+    }
+
+    // Starting with the second iteration, higher-order interpolation can
+    // be used.
+    int itnum = 1;   // Iteration number
+    int baditer = 1; // Num bisections to take if an iteration is bad.
+    T c, e;  // e is our fourth best guess
+    R fe;
+
+whileloop:
+    while (!done && (b != nextUp(a)) && !tolerance(a, b))
+    {
+        T a0 = a, b0 = b; // record the brackets
+
+        // Do two higher-order (cubic or parabolic) interpolation steps.
+        foreach (int QQ; 0 .. 2)
+        {
+            // Cubic inverse interpolation requires that
+            // all four function values fa, fb, fd, and fe are distinct;
+            // otherwise use quadratic interpolation.
+            bool distinct = (fa != fb) && (fa != fd) && (fa != fe)
+                         && (fb != fd) && (fb != fe) && (fd != fe);
+            // The first time, cubic interpolation is impossible.
+            if (itnum<2) distinct = false;
+            bool ok = distinct;
+            if (distinct)
+            {
+                // Cubic inverse interpolation of f(x) at a, b, d, and e
+                immutable q11 = (d - e) * fd / (fe - fd);
+                immutable q21 = (b - d) * fb / (fd - fb);
+                immutable q31 = (a - b) * fa / (fb - fa);
+                immutable d21 = (b - d) * fd / (fd - fb);
+                immutable d31 = (a - b) * fb / (fb - fa);
+
+                immutable q22 = (d21 - q11) * fb / (fe - fb);
+                immutable q32 = (d31 - q21) * fa / (fd - fa);
+                immutable d32 = (d31 - q21) * fd / (fd - fa);
+                immutable q33 = (d32 - q22) * fa / (fe - fa);
+                c = a + (q31 + q32 + q33);
+                if (c.isNaN() || (c <= a) || (c >= b))
+                {
+                    // DAC: If the interpolation predicts a or b, it's
+                    // probable that it's the actual root. Only allow this if
+                    // we're already close to the root.
+                    if (c == a && a - b != a)
+                    {
+                        c = nextUp(a);
+                    }
+                    else if (c == b && a - b != -b)
+                    {
+                        c = nextDown(b);
+                    }
+                    else
+                    {
+                        ok = false;
+                    }
+                }
+            }
+            if (!ok)
+            {
+                // DAC: Alefeld doesn't explain why the number of newton steps
+                // should vary.
+                c = newtonQuadratic(distinct ? 3 : 2);
+                if (c.isNaN() || (c <= a) || (c >= b))
+                {
+                    // Failure, try a secant step:
+                    c = secant_interpolate(a, b, fa, fb);
+                }
+            }
+            ++itnum;
+            e = d;
+            fe = fd;
+            bracket(c);
+            if (done || ( b == nextUp(a)) || tolerance(a, b))
+                break whileloop;
+            if (itnum == 2)
+                continue whileloop;
+        }
+
+        // Now we take a double-length secant step:
+        T u;
+        R fu;
+        if (fabs(fa) < fabs(fb))
+        {
+            u = a;
+            fu = fa;
+        }
+        else
+        {
+            u = b;
+            fu = fb;
+        }
+        c = u - 2 * (fu / (fb - fa)) * (b - a);
+
+        // DAC: If the secant predicts a value equal to an endpoint, it's
+        // probably false.
+        if (c == a || c == b || c.isNaN() || fabs(c - u) > (b - a) / 2)
+        {
+            if ((a-b) == a || (b-a) == b)
+            {
+                if ((a>0 && b<0) || (a<0 && b>0))
+                    c = 0;
+                else
+                {
+                    if (a == 0)
+                        c = ieeeMean(copysign(T(0), b), b);
+                    else if (b == 0)
+                        c = ieeeMean(copysign(T(0), a), a);
+                    else
+                        c = ieeeMean(a, b);
+                }
+            }
+            else
+            {
+                c = a + (b - a) / 2;
+            }
+        }
+        e = d;
+        fe = fd;
+        bracket(c);
+        if (done || (b == nextUp(a)) || tolerance(a, b))
+            break;
+
+        // IMPROVE THE WORST-CASE PERFORMANCE
+        // We must ensure that the bounds reduce by a factor of 2
+        // in binary space! every iteration. If we haven't achieved this
+        // yet, or if we don't yet know what the exponent is,
+        // perform a binary chop.
+
+        if ((a == 0 || b == 0 ||
+            (fabs(a) >= T(0.5) * fabs(b) && fabs(b) >= T(0.5) * fabs(a)))
+            &&  (b - a) < T(0.25) * (b0 - a0))
+        {
+            baditer = 1;
+            continue;
+        }
+
+        // DAC: If this happens on consecutive iterations, we probably have a
+        // pathological function. Perform a number of bisections equal to the
+        // total number of consecutive bad iterations.
+
+        if ((b - a) < T(0.25) * (b0 - a0))
+            baditer = 1;
+        foreach (int QQ; 0 .. baditer)
+        {
+            e = d;
+            fe = fd;
+
+            T w;
+            if ((a>0 && b<0) || (a<0 && b>0))
+                w = 0;
+            else
+            {
+                T usea = a;
+                T useb = b;
+                if (a == 0)
+                    usea = copysign(T(0), b);
+                else if (b == 0)
+                    useb = copysign(T(0), a);
+                w = ieeeMean(usea, useb);
+            }
+            bracket(w);
+        }
+        ++baditer;
+    }
+    return Tuple!(T, T, R, R)(a, b, fa, fb);
+}
+
+///ditto
+Tuple!(T, T, R, R) findRoot(T, R, DF)(scope DF f, in T ax, in T bx, in R fax, in R fbx)
+{
+    return findRoot(f, ax, bx, fax, fbx, (T a, T b) => false);
+}
+
+///ditto
+T findRoot(T, R)(scope R delegate(T) f, in T a, in T b,
+    scope bool delegate(T lo, T hi) tolerance = (T a, T b) => false)
+{
+    return findRoot!(T, R delegate(T), bool delegate(T lo, T hi))(f, a, b, tolerance);
+}
+
+@safe nothrow unittest
+{
+    int numProblems = 0;
+    int numCalls;
+
+    void testFindRoot(real delegate(real) @nogc @safe nothrow pure f , real x1, real x2) @nogc @safe nothrow pure
+    {
+        //numCalls=0;
+        //++numProblems;
+        assert(!x1.isNaN() && !x2.isNaN());
+        assert(signbit(x1) != signbit(x2));
+        auto result = findRoot(f, x1, x2, f(x1), f(x2),
+          (real lo, real hi) { return false; });
+
+        auto flo = f(result[0]);
+        auto fhi = f(result[1]);
+        if (flo != 0)
+        {
+            assert(oppositeSigns(flo, fhi));
+        }
+    }
+
+    // Test functions
+    real cubicfn(real x) @nogc @safe nothrow pure
+    {
+        //++numCalls;
+        if (x>float.max)
+            x = float.max;
+        if (x<-double.max)
+            x = -double.max;
+        // This has a single real root at -59.286543284815
+        return 0.386*x*x*x + 23*x*x + 15.7*x + 525.2;
+    }
+    // Test a function with more than one root.
+    real multisine(real x) { ++numCalls; return sin(x); }
+    //testFindRoot( &multisine, 6, 90);
+    //testFindRoot(&cubicfn, -100, 100);
+    //testFindRoot( &cubicfn, -double.max, real.max);
+
+
+/* Tests from the paper:
+ * "On Enclosing Simple Roots of Nonlinear Equations", G. Alefeld, F.A. Potra,
+ *   Yixun Shi, Mathematics of Computation 61, pp733-744 (1993).
+ */
+    // Parameters common to many alefeld tests.
+    int n;
+    real ale_a, ale_b;
+
+    int powercalls = 0;
+
+    real power(real x)
+    {
+        ++powercalls;
+        ++numCalls;
+        return pow(x, n) + double.min_normal;
+    }
+    int [] power_nvals = [3, 5, 7, 9, 19, 25];
+    // Alefeld paper states that pow(x,n) is a very poor case, where bisection
+    // outperforms his method, and gives total numcalls =
+    // 921 for bisection (2.4 calls per bit), 1830 for Alefeld (4.76/bit),
+    // 2624 for brent (6.8/bit)
+    // ... but that is for double, not real80.
+    // This poor performance seems mainly due to catastrophic cancellation,
+    // which is avoided here by the use of ieeeMean().
+    // I get: 231 (0.48/bit).
+    // IE this is 10X faster in Alefeld's worst case
+    numProblems=0;
+    foreach (k; power_nvals)
+    {
+        n = k;
+        //testFindRoot(&power, -1, 10);
+    }
+
+    int powerProblems = numProblems;
+
+    // Tests from Alefeld paper
+
+    int [9] alefeldSums;
+    real alefeld0(real x)
+    {
+        ++alefeldSums[0];
+        ++numCalls;
+        real q =  sin(x) - x/2;
+        for (int i=1; i<20; ++i)
+            q+=(2*i-5.0)*(2*i-5.0)/((x-i*i)*(x-i*i)*(x-i*i));
+        return q;
+    }
+    real alefeld1(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[1];
+        return ale_a*x + exp(ale_b * x);
+    }
+    real alefeld2(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[2];
+        return pow(x, n) - ale_a;
+    }
+    real alefeld3(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[3];
+        return (1.0 +pow(1.0L-n, 2))*x - pow(1.0L-n*x, 2);
+    }
+    real alefeld4(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[4];
+        return x*x - pow(1-x, n);
+    }
+    real alefeld5(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[5];
+        return (1+pow(1.0L-n, 4))*x - pow(1.0L-n*x, 4);
+    }
+    real alefeld6(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[6];
+        return exp(-n*x)*(x-1.01L) + pow(x, n);
+    }
+    real alefeld7(real x)
+    {
+        ++numCalls;
+        ++alefeldSums[7];
+        return (n*x-1)/((n-1)*x);
+    }
+
+    numProblems=0;
+    //testFindRoot(&alefeld0, PI_2, PI);
+    for (n=1; n <= 10; ++n)
+    {
+        //testFindRoot(&alefeld0, n*n+1e-9L, (n+1)*(n+1)-1e-9L);
+    }
+    ale_a = -40; ale_b = -1;
+    //testFindRoot(&alefeld1, -9, 31);
+    ale_a = -100; ale_b = -2;
+    //testFindRoot(&alefeld1, -9, 31);
+    ale_a = -200; ale_b = -3;
+    //testFindRoot(&alefeld1, -9, 31);
+    int [] nvals_3 = [1, 2, 5, 10, 15, 20];
+    int [] nvals_5 = [1, 2, 4, 5, 8, 15, 20];
+    int [] nvals_6 = [1, 5, 10, 15, 20];
+    int [] nvals_7 = [2, 5, 15, 20];
+
+    for (int i=4; i<12; i+=2)
+    {
+       n = i;
+       ale_a = 0.2;
+       //testFindRoot(&alefeld2, 0, 5);
+       ale_a=1;
+       //testFindRoot(&alefeld2, 0.95, 4.05);
+       //testFindRoot(&alefeld2, 0, 1.5);
+    }
+    foreach (i; nvals_3)
+    {
+        n=i;
+        //testFindRoot(&alefeld3, 0, 1);
+    }
+    foreach (i; nvals_3)
+    {
+        n=i;
+        //testFindRoot(&alefeld4, 0, 1);
+    }
+    foreach (i; nvals_5)
+    {
+        n=i;
+        //testFindRoot(&alefeld5, 0, 1);
+    }
+    foreach (i; nvals_6)
+    {
+        n=i;
+        //testFindRoot(&alefeld6, 0, 1);
+    }
+    foreach (i; nvals_7)
+    {
+        n=i;
+        //testFindRoot(&alefeld7, 0.01L, 1);
+    }
+    real worstcase(real x)
+    {
+        ++numCalls;
+        return x<0.3*real.max? -0.999e-3 : 1.0;
+    }
+    //testFindRoot(&worstcase, -real.max, real.max);
+
+    // just check that the double + float cases compile
+    //findRoot((double x){ return 0.0; }, -double.max, double.max);
+    //findRoot((float x){ return 0.0f; }, -float.max, float.max);
+
+/*
+   int grandtotal=0;
+   foreach (calls; alefeldSums)
+   {
+       grandtotal+=calls;
+   }
+   grandtotal-=2*numProblems;
+   printf("\nALEFELD TOTAL = %d avg = %f (alefeld avg=19.3 for double)\n",
+   grandtotal, (1.0*grandtotal)/numProblems);
+   powercalls -= 2*powerProblems;
+   printf("POWER TOTAL = %d avg = %f ", powercalls,
+        (1.0*powercalls)/powerProblems);
+*/
+    //Issue 14231
+    auto xp = findRoot((float x) => x, 0f, 1f);
+    auto xn = findRoot((float x) => x, -1f, -0f);
+}
+
+//regression control
+@system unittest
+{
+    // @system due to the case in the 2nd line
+    static assert(__traits(compiles, findRoot((float x)=>cast(real) x, float.init, float.init)));
+    static assert(__traits(compiles, findRoot!real((x)=>cast(double) x, real.init, real.init)));
+    static assert(__traits(compiles, findRoot((real x)=>cast(double) x, real.init, real.init)));
+}
+
+/++
+Find a real minimum of a real function `f(x)` via bracketing.
+Given a function `f` and a range `(ax .. bx)`,
+returns the value of `x` in the range which is closest to a minimum of `f(x)`.
+`f` is never evaluted at the endpoints of `ax` and `bx`.
+If `f(x)` has more than one minimum in the range, one will be chosen arbitrarily.
+If `f(x)` returns NaN or -Infinity, `(x, f(x), NaN)` will be returned;
+otherwise, this algorithm is guaranteed to succeed.
+
+Params:
+    f = Function to be analyzed
+    ax = Left bound of initial range of f known to contain the minimum.
+    bx = Right bound of initial range of f known to contain the minimum.
+    relTolerance = Relative tolerance.
+    absTolerance = Absolute tolerance.
+
+Preconditions:
+    `ax` and `bx` shall be finite reals. $(BR)
+    $(D relTolerance) shall be normal positive real. $(BR)
+    $(D absTolerance) shall be normal positive real no less then $(D T.epsilon*2).
+
+Returns:
+    A tuple consisting of `x`, `y = f(x)` and `error = 3 * (absTolerance * fabs(x) + relTolerance)`.
+
+    The method used is a combination of golden section search and
+successive parabolic interpolation. Convergence is never much slower
+than that for a Fibonacci search.
+
+References:
+    "Algorithms for Minimization without Derivatives", Richard Brent, Prentice-Hall, Inc. (1973)
+
+See_Also: $(LREF findRoot), $(REF isNormal, std,math)
++/
+Tuple!(T, "x", Unqual!(ReturnType!DF), "y", T, "error")
+findLocalMin(T, DF)(
+        scope DF f,
+        in T ax,
+        in T bx,
+        in T relTolerance = sqrt(T.epsilon),
+        in T absTolerance = sqrt(T.epsilon),
+        )
+if (isFloatingPoint!T
+    && __traits(compiles, {T _ = DF.init(T.init);}))
+in
+{
+    assert(isFinite(ax), "ax is not finite");
+    assert(isFinite(bx), "bx is not finite");
+    assert(isNormal(relTolerance), "relTolerance is not normal floating point number");
+    assert(isNormal(absTolerance), "absTolerance is not normal floating point number");
+    assert(relTolerance >= 0, "absTolerance is not positive");
+    assert(absTolerance >= T.epsilon*2, "absTolerance is not greater then `2*T.epsilon`");
+}
+out (result)
+{
+    assert(isFinite(result.x));
+}
+body
+{
+    alias R = Unqual!(CommonType!(ReturnType!DF, T));
+    // c is the squared inverse of the golden ratio
+    // (3 - sqrt(5))/2
+    // Value obtained from Wolfram Alpha.
+    enum T c = 0x0.61c8864680b583ea0c633f9fa31237p+0L;
+    enum T cm1 = 0x0.9e3779b97f4a7c15f39cc0605cedc8p+0L;
+    R tolerance;
+    T a = ax > bx ? bx : ax;
+    T b = ax > bx ? ax : bx;
+    // sequence of declarations suitable for SIMD instructions
+    T  v = a * cm1 + b * c;
+    assert(isFinite(v));
+    R fv = f(v);
+    if (isNaN(fv) || fv == -T.infinity)
+    {
+        return typeof(return)(v, fv, T.init);
+    }
+    T  w = v;
+    R fw = fv;
+    T  x = v;
+    R fx = fv;
+    size_t i;
+    for (R d = 0, e = 0;;)
+    {
+        i++;
+        T m = (a + b) / 2;
+        // This fix is not part of the original algorithm
+        if (!isFinite(m)) // fix infinity loop. Issue can be reproduced in R.
+        {
+            m = a / 2 + b / 2;
+            if (!isFinite(m)) // fast-math compiler switch is enabled
+            {
+                //SIMD instructions can be used by compiler, do not reduce declarations
+                int a_exp = void;
+                int b_exp = void;
+                immutable an = frexp(a, a_exp);
+                immutable bn = frexp(b, b_exp);
+                immutable am = ldexp(an, a_exp-1);
+                immutable bm = ldexp(bn, b_exp-1);
+                m = am + bm;
+                if (!isFinite(m)) // wrong input: constraints are disabled in release mode
+                {
+                    return typeof(return).init;
+                }
+            }
+        }
+        tolerance = absTolerance * fabs(x) + relTolerance;
+        immutable t2 = tolerance * 2;
+        // check stopping criterion
+        if (!(fabs(x - m) > t2 - (b - a) / 2))
+        {
+            break;
+        }
+        R p = 0;
+        R q = 0;
+        R r = 0;
+        // fit parabola
+        if (fabs(e) > tolerance)
+        {
+            immutable  xw =  x -  w;
+            immutable fxw = fx - fw;
+            immutable  xv =  x -  v;
+            immutable fxv = fx - fv;
+            immutable xwfxv = xw * fxv;
+            immutable xvfxw = xv * fxw;
+            p = xv * xvfxw - xw * xwfxv;
+            q = (xvfxw - xwfxv) * 2;
+            if (q > 0)
+                p = -p;
+            else
+                q = -q;
+            r = e;
+            e = d;
+        }
+        T u;
+        // a parabolic-interpolation step
+        if (fabs(p) < fabs(q * r / 2) && p > q * (a - x) && p < q * (b - x))
+        {
+            d = p / q;
+            u = x + d;
+            // f must not be evaluated too close to a or b
+            if (u - a < t2 || b - u < t2)
+                d = x < m ? tolerance : -tolerance;
+        }
+        // a golden-section step
+        else
+        {
+            e = (x < m ? b : a) - x;
+            d = c * e;
+        }
+        // f must not be evaluated too close to x
+        u = x + (fabs(d) >= tolerance ? d : d > 0 ? tolerance : -tolerance);
+        immutable fu = f(u);
+        if (isNaN(fu) || fu == -T.infinity)
+        {
+            return typeof(return)(u, fu, T.init);
+        }
+        //  update  a, b, v, w, and x
+        if (fu <= fx)
+        {
+            u < x ? b : a = x;
+            v = w; fv = fw;
+            w = x; fw = fx;
+            x = u; fx = fu;
+        }
+        else
+        {
+            u < x ? a : b = u;
+            if (fu <= fw || w == x)
+            {
+                v = w; fv = fw;
+                w = u; fw = fu;
+            }
+            else if (fu <= fv || v == x || v == w)
+            { // do not remove this braces
+                v = u; fv = fu;
+            }
+        }
+    }
+    return typeof(return)(x, fx, tolerance * 3);
+}
+
+///
+@safe unittest
+{
+    import std.math : approxEqual;
+
+    auto ret = findLocalMin((double x) => (x-4)^^2, -1e7, 1e7);
+    assert(ret.x.approxEqual(4.0));
+    assert(ret.y.approxEqual(0.0));
+}
+
+@safe unittest
+{
+    import std.meta : AliasSeq;
+    foreach (T; AliasSeq!(double, float, real))
+    {
+        {
+            auto ret = findLocalMin!T((T x) => (x-4)^^2, T.min_normal, 1e7);
+            assert(ret.x.approxEqual(T(4)));
+            assert(ret.y.approxEqual(T(0)));
+        }
+        {
+            auto ret = findLocalMin!T((T x) => fabs(x-1), -T.max/4, T.max/4, T.min_normal, 2*T.epsilon);
+            assert(approxEqual(ret.x, T(1)));
+            assert(approxEqual(ret.y, T(0)));
+            assert(ret.error <= 10 * T.epsilon);
+        }
+        {
+            auto ret = findLocalMin!T((T x) => T.init, 0, 1, T.min_normal, 2*T.epsilon);
+            assert(!ret.x.isNaN);
+            assert(ret.y.isNaN);
+            assert(ret.error.isNaN);
+        }
+        {
+            auto ret = findLocalMin!T((T x) => log(x), 0, 1, T.min_normal, 2*T.epsilon);
+            assert(ret.error < 3.00001 * ((2*T.epsilon)*fabs(ret.x)+ T.min_normal));
+            assert(ret.x >= 0 && ret.x <= ret.error);
+        }
+        {
+            auto ret = findLocalMin!T((T x) => log(x), 0, T.max, T.min_normal, 2*T.epsilon);
+            assert(ret.y < -18);
+            assert(ret.error < 5e-08);
+            assert(ret.x >= 0 && ret.x <= ret.error);
+        }
+        {
+            auto ret = findLocalMin!T((T x) => -fabs(x), -1, 1, T.min_normal, 2*T.epsilon);
+            assert(ret.x.fabs.approxEqual(T(1)));
+            assert(ret.y.fabs.approxEqual(T(1)));
+            assert(ret.error.approxEqual(T(0)));
+        }
+    }
+}
+
+/**
+Computes $(LINK2 https://en.wikipedia.org/wiki/Euclidean_distance,
+Euclidean distance) between input ranges $(D a) and
+$(D b). The two ranges must have the same length. The three-parameter
+version stops computation as soon as the distance is greater than or
+equal to $(D limit) (this is useful to save computation if a small
+distance is sought).
+ */
+CommonType!(ElementType!(Range1), ElementType!(Range2))
+euclideanDistance(Range1, Range2)(Range1 a, Range2 b)
+if (isInputRange!(Range1) && isInputRange!(Range2))
+{
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) result = 0;
+    for (; !a.empty; a.popFront(), b.popFront())
+    {
+        immutable t = a.front - b.front;
+        result += t * t;
+    }
+    static if (!haveLen) assert(b.empty);
+    return sqrt(result);
+}
+
+/// Ditto
+CommonType!(ElementType!(Range1), ElementType!(Range2))
+euclideanDistance(Range1, Range2, F)(Range1 a, Range2 b, F limit)
+if (isInputRange!(Range1) && isInputRange!(Range2))
+{
+    limit *= limit;
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) result = 0;
+    for (; ; a.popFront(), b.popFront())
+    {
+        if (a.empty)
+        {
+            static if (!haveLen) assert(b.empty);
+            break;
+        }
+        immutable t = a.front - b.front;
+        result += t * t;
+        if (result >= limit) break;
+    }
+    return sqrt(result);
+}
+
+@safe unittest
+{
+    import std.meta : AliasSeq;
+    foreach (T; AliasSeq!(double, const double, immutable double))
+    {
+        T[] a = [ 1.0, 2.0, ];
+        T[] b = [ 4.0, 6.0, ];
+        assert(euclideanDistance(a, b) == 5);
+        assert(euclideanDistance(a, b, 5) == 5);
+        assert(euclideanDistance(a, b, 4) == 5);
+        assert(euclideanDistance(a, b, 2) == 3);
+    }
+}
+
+/**
+Computes the $(LINK2 https://en.wikipedia.org/wiki/Dot_product,
+dot product) of input ranges $(D a) and $(D
+b). The two ranges must have the same length. If both ranges define
+length, the check is done once; otherwise, it is done at each
+iteration.
+ */
+CommonType!(ElementType!(Range1), ElementType!(Range2))
+dotProduct(Range1, Range2)(Range1 a, Range2 b)
+if (isInputRange!(Range1) && isInputRange!(Range2) &&
+    !(isArray!(Range1) && isArray!(Range2)))
+{
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) result = 0;
+    for (; !a.empty; a.popFront(), b.popFront())
+    {
+        result += a.front * b.front;
+    }
+    static if (!haveLen) assert(b.empty);
+    return result;
+}
+
+/// Ditto
+CommonType!(F1, F2)
+dotProduct(F1, F2)(in F1[] avector, in F2[] bvector)
+{
+    immutable n = avector.length;
+    assert(n == bvector.length);
+    auto avec = avector.ptr, bvec = bvector.ptr;
+    Unqual!(typeof(return)) sum0 = 0, sum1 = 0;
+
+    const all_endp = avec + n;
+    const smallblock_endp = avec + (n & ~3);
+    const bigblock_endp = avec + (n & ~15);
+
+    for (; avec != bigblock_endp; avec += 16, bvec += 16)
+    {
+        sum0 += avec[0] * bvec[0];
+        sum1 += avec[1] * bvec[1];
+        sum0 += avec[2] * bvec[2];
+        sum1 += avec[3] * bvec[3];
+        sum0 += avec[4] * bvec[4];
+        sum1 += avec[5] * bvec[5];
+        sum0 += avec[6] * bvec[6];
+        sum1 += avec[7] * bvec[7];
+        sum0 += avec[8] * bvec[8];
+        sum1 += avec[9] * bvec[9];
+        sum0 += avec[10] * bvec[10];
+        sum1 += avec[11] * bvec[11];
+        sum0 += avec[12] * bvec[12];
+        sum1 += avec[13] * bvec[13];
+        sum0 += avec[14] * bvec[14];
+        sum1 += avec[15] * bvec[15];
+    }
+
+    for (; avec != smallblock_endp; avec += 4, bvec += 4)
+    {
+        sum0 += avec[0] * bvec[0];
+        sum1 += avec[1] * bvec[1];
+        sum0 += avec[2] * bvec[2];
+        sum1 += avec[3] * bvec[3];
+    }
+
+    sum0 += sum1;
+
+    /* Do trailing portion in naive loop. */
+    while (avec != all_endp)
+    {
+        sum0 += *avec * *bvec;
+        ++avec;
+        ++bvec;
+    }
+
+    return sum0;
+}
+
+@system unittest
+{
+    // @system due to dotProduct and assertCTFEable
+    import std.exception : assertCTFEable;
+    import std.meta : AliasSeq;
+    foreach (T; AliasSeq!(double, const double, immutable double))
+    {
+        T[] a = [ 1.0, 2.0, ];
+        T[] b = [ 4.0, 6.0, ];
+        assert(dotProduct(a, b) == 16);
+        assert(dotProduct([1, 3, -5], [4, -2, -1]) == 3);
+    }
+
+    // Make sure the unrolled loop codepath gets tested.
+    static const x =
+        [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18];
+    static const y =
+        [2.0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19];
+    assertCTFEable!({ assert(dotProduct(x, y) == 2280); });
+}
+
+/**
+Computes the $(LINK2 https://en.wikipedia.org/wiki/Cosine_similarity,
+cosine similarity) of input ranges $(D a) and $(D
+b). The two ranges must have the same length. If both ranges define
+length, the check is done once; otherwise, it is done at each
+iteration. If either range has all-zero elements, return 0.
+ */
+CommonType!(ElementType!(Range1), ElementType!(Range2))
+cosineSimilarity(Range1, Range2)(Range1 a, Range2 b)
+if (isInputRange!(Range1) && isInputRange!(Range2))
+{
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) norma = 0, normb = 0, dotprod = 0;
+    for (; !a.empty; a.popFront(), b.popFront())
+    {
+        immutable t1 = a.front, t2 = b.front;
+        norma += t1 * t1;
+        normb += t2 * t2;
+        dotprod += t1 * t2;
+    }
+    static if (!haveLen) assert(b.empty);
+    if (norma == 0 || normb == 0) return 0;
+    return dotprod / sqrt(norma * normb);
+}
+
+@safe unittest
+{
+    import std.meta : AliasSeq;
+    foreach (T; AliasSeq!(double, const double, immutable double))
+    {
+        T[] a = [ 1.0, 2.0, ];
+        T[] b = [ 4.0, 3.0, ];
+        assert(approxEqual(
+                    cosineSimilarity(a, b), 10.0 / sqrt(5.0 * 25),
+                    0.01));
+    }
+}
+
+/**
+Normalizes values in $(D range) by multiplying each element with a
+number chosen such that values sum up to $(D sum). If elements in $(D
+range) sum to zero, assigns $(D sum / range.length) to
+all. Normalization makes sense only if all elements in $(D range) are
+positive. $(D normalize) assumes that is the case without checking it.
+
+Returns: $(D true) if normalization completed normally, $(D false) if
+all elements in $(D range) were zero or if $(D range) is empty.
+ */
+bool normalize(R)(R range, ElementType!(R) sum = 1)
+if (isForwardRange!(R))
+{
+    ElementType!(R) s = 0;
+    // Step 1: Compute sum and length of the range
+    static if (hasLength!(R))
+    {
+        const length = range.length;
+        foreach (e; range)
+        {
+            s += e;
+        }
+    }
+    else
+    {
+        uint length = 0;
+        foreach (e; range)
+        {
+            s += e;
+            ++length;
+        }
+    }
+    // Step 2: perform normalization
+    if (s == 0)
+    {
+        if (length)
+        {
+            immutable f = sum / range.length;
+            foreach (ref e; range) e = f;
+        }
+        return false;
+    }
+    // The path most traveled
+    assert(s >= 0);
+    immutable f = sum / s;
+    foreach (ref e; range)
+        e *= f;
+    return true;
+}
+
+///
+@safe unittest
+{
+    double[] a = [];
+    assert(!normalize(a));
+    a = [ 1.0, 3.0 ];
+    assert(normalize(a));
+    assert(a == [ 0.25, 0.75 ]);
+    a = [ 0.0, 0.0 ];
+    assert(!normalize(a));
+    assert(a == [ 0.5, 0.5 ]);
+}
+
+/**
+Compute the sum of binary logarithms of the input range $(D r).
+The error of this method is much smaller than with a naive sum of log2.
+ */
+ElementType!Range sumOfLog2s(Range)(Range r)
+if (isInputRange!Range && isFloatingPoint!(ElementType!Range))
+{
+    long exp = 0;
+    Unqual!(typeof(return)) x = 1;
+    foreach (e; r)
+    {
+        if (e < 0)
+            return typeof(return).nan;
+        int lexp = void;
+        x *= frexp(e, lexp);
+        exp += lexp;
+        if (x < 0.5)
+        {
+            x *= 2;
+            exp--;
+        }
+    }
+    return exp + log2(x);
+}
+
+///
+@safe unittest
+{
+    import std.math : isNaN;
+
+    assert(sumOfLog2s(new double[0]) == 0);
+    assert(sumOfLog2s([0.0L]) == -real.infinity);
+    assert(sumOfLog2s([-0.0L]) == -real.infinity);
+    assert(sumOfLog2s([2.0L]) == 1);
+    assert(sumOfLog2s([-2.0L]).isNaN());
+    assert(sumOfLog2s([real.nan]).isNaN());
+    assert(sumOfLog2s([-real.nan]).isNaN());
+    assert(sumOfLog2s([real.infinity]) == real.infinity);
+    assert(sumOfLog2s([-real.infinity]).isNaN());
+    assert(sumOfLog2s([ 0.25, 0.25, 0.25, 0.125 ]) == -9);
+}
+
+/**
+Computes $(LINK2 https://en.wikipedia.org/wiki/Entropy_(information_theory),
+_entropy) of input range $(D r) in bits. This
+function assumes (without checking) that the values in $(D r) are all
+in $(D [0, 1]). For the entropy to be meaningful, often $(D r) should
+be normalized too (i.e., its values should sum to 1). The
+two-parameter version stops evaluating as soon as the intermediate
+result is greater than or equal to $(D max).
+ */
+ElementType!Range entropy(Range)(Range r)
+if (isInputRange!Range)
+{
+    Unqual!(typeof(return)) result = 0.0;
+    for (;!r.empty; r.popFront)
+    {
+        if (!r.front) continue;
+        result -= r.front * log2(r.front);
+    }
+    return result;
+}
+
+/// Ditto
+ElementType!Range entropy(Range, F)(Range r, F max)
+if (isInputRange!Range &&
+    !is(CommonType!(ElementType!Range, F) == void))
+{
+    Unqual!(typeof(return)) result = 0.0;
+    for (;!r.empty; r.popFront)
+    {
+        if (!r.front) continue;
+        result -= r.front * log2(r.front);
+        if (result >= max) break;
+    }
+    return result;
+}
+
+@safe unittest
+{
+    import std.meta : AliasSeq;
+    foreach (T; AliasSeq!(double, const double, immutable double))
+    {
+        T[] p = [ 0.0, 0, 0, 1 ];
+        assert(entropy(p) == 0);
+        p = [ 0.25, 0.25, 0.25, 0.25 ];
+        assert(entropy(p) == 2);
+        assert(entropy(p, 1) == 1);
+    }
+}
+
+/**
+Computes the $(LINK2 https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence,
+Kullback-Leibler divergence) between input ranges
+$(D a) and $(D b), which is the sum $(D ai * log(ai / bi)). The base
+of logarithm is 2. The ranges are assumed to contain elements in $(D
+[0, 1]). Usually the ranges are normalized probability distributions,
+but this is not required or checked by $(D
+kullbackLeiblerDivergence). If any element $(D bi) is zero and the
+corresponding element $(D ai) nonzero, returns infinity. (Otherwise,
+if $(D ai == 0 && bi == 0), the term $(D ai * log(ai / bi)) is
+considered zero.) If the inputs are normalized, the result is
+positive.
+ */
+CommonType!(ElementType!Range1, ElementType!Range2)
+kullbackLeiblerDivergence(Range1, Range2)(Range1 a, Range2 b)
+if (isInputRange!(Range1) && isInputRange!(Range2))
+{
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) result = 0;
+    for (; !a.empty; a.popFront(), b.popFront())
+    {
+        immutable t1 = a.front;
+        if (t1 == 0) continue;
+        immutable t2 = b.front;
+        if (t2 == 0) return result.infinity;
+        assert(t1 > 0 && t2 > 0);
+        result += t1 * log2(t1 / t2);
+    }
+    static if (!haveLen) assert(b.empty);
+    return result;
+}
+
+///
+@safe unittest
+{
+    import std.math : approxEqual;
+
+    double[] p = [ 0.0, 0, 0, 1 ];
+    assert(kullbackLeiblerDivergence(p, p) == 0);
+    double[] p1 = [ 0.25, 0.25, 0.25, 0.25 ];
+    assert(kullbackLeiblerDivergence(p1, p1) == 0);
+    assert(kullbackLeiblerDivergence(p, p1) == 2);
+    assert(kullbackLeiblerDivergence(p1, p) == double.infinity);
+    double[] p2 = [ 0.2, 0.2, 0.2, 0.4 ];
+    assert(approxEqual(kullbackLeiblerDivergence(p1, p2), 0.0719281));
+    assert(approxEqual(kullbackLeiblerDivergence(p2, p1), 0.0780719));
+}
+
+/**
+Computes the $(LINK2 https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence,
+Jensen-Shannon divergence) between $(D a) and $(D
+b), which is the sum $(D (ai * log(2 * ai / (ai + bi)) + bi * log(2 *
+bi / (ai + bi))) / 2). The base of logarithm is 2. The ranges are
+assumed to contain elements in $(D [0, 1]). Usually the ranges are
+normalized probability distributions, but this is not required or
+checked by $(D jensenShannonDivergence). If the inputs are normalized,
+the result is bounded within $(D [0, 1]). The three-parameter version
+stops evaluations as soon as the intermediate result is greater than
+or equal to $(D limit).
+ */
+CommonType!(ElementType!Range1, ElementType!Range2)
+jensenShannonDivergence(Range1, Range2)(Range1 a, Range2 b)
+if (isInputRange!Range1 && isInputRange!Range2 &&
+    is(CommonType!(ElementType!Range1, ElementType!Range2)))
+{
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) result = 0;
+    for (; !a.empty; a.popFront(), b.popFront())
+    {
+        immutable t1 = a.front;
+        immutable t2 = b.front;
+        immutable avg = (t1 + t2) / 2;
+        if (t1 != 0)
+        {
+            result += t1 * log2(t1 / avg);
+        }
+        if (t2 != 0)
+        {
+            result += t2 * log2(t2 / avg);
+        }
+    }
+    static if (!haveLen) assert(b.empty);
+    return result / 2;
+}
+
+/// Ditto
+CommonType!(ElementType!Range1, ElementType!Range2)
+jensenShannonDivergence(Range1, Range2, F)(Range1 a, Range2 b, F limit)
+if (isInputRange!Range1 && isInputRange!Range2 &&
+    is(typeof(CommonType!(ElementType!Range1, ElementType!Range2).init
+    >= F.init) : bool))
+{
+    enum bool haveLen = hasLength!(Range1) && hasLength!(Range2);
+    static if (haveLen) assert(a.length == b.length);
+    Unqual!(typeof(return)) result = 0;
+    limit *= 2;
+    for (; !a.empty; a.popFront(), b.popFront())
+    {
+        immutable t1 = a.front;
+        immutable t2 = b.front;
+        immutable avg = (t1 + t2) / 2;
+        if (t1 != 0)
+        {
+            result += t1 * log2(t1 / avg);
+        }
+        if (t2 != 0)
+        {
+            result += t2 * log2(t2 / avg);
+        }
+        if (result >= limit) break;
+    }
+    static if (!haveLen) assert(b.empty);
+    return result / 2;
+}
+
+///
+@safe unittest
+{
+    import std.math : approxEqual;
+
+    double[] p = [ 0.0, 0, 0, 1 ];
+    assert(jensenShannonDivergence(p, p) == 0);
+    double[] p1 = [ 0.25, 0.25, 0.25, 0.25 ];
+    assert(jensenShannonDivergence(p1, p1) == 0);
+    assert(approxEqual(jensenShannonDivergence(p1, p), 0.548795));
+    double[] p2 = [ 0.2, 0.2, 0.2, 0.4 ];
+    assert(approxEqual(jensenShannonDivergence(p1, p2), 0.0186218));
+    assert(approxEqual(jensenShannonDivergence(p2, p1), 0.0186218));
+    assert(approxEqual(jensenShannonDivergence(p2, p1, 0.005), 0.00602366));
+}
+
+/**
+The so-called "all-lengths gap-weighted string kernel" computes a
+similarity measure between $(D s) and $(D t) based on all of their
+common subsequences of all lengths. Gapped subsequences are also
+included.
+
+To understand what $(D gapWeightedSimilarity(s, t, lambda)) computes,
+consider first the case $(D lambda = 1) and the strings $(D s =
+["Hello", "brave", "new", "world"]) and $(D t = ["Hello", "new",
+"world"]). In that case, $(D gapWeightedSimilarity) counts the
+following matches:
+
+$(OL $(LI three matches of length 1, namely $(D "Hello"), $(D "new"),
+and $(D "world");) $(LI three matches of length 2, namely ($(D
+"Hello", "new")), ($(D "Hello", "world")), and ($(D "new", "world"));)
+$(LI one match of length 3, namely ($(D "Hello", "new", "world")).))
+
+The call $(D gapWeightedSimilarity(s, t, 1)) simply counts all of
+these matches and adds them up, returning 7.
+
+----
+string[] s = ["Hello", "brave", "new", "world"];
+string[] t = ["Hello", "new", "world"];
+assert(gapWeightedSimilarity(s, t, 1) == 7);
+----
+
+Note how the gaps in matching are simply ignored, for example ($(D
+"Hello", "new")) is deemed as good a match as ($(D "new",
+"world")). This may be too permissive for some applications. To
+eliminate gapped matches entirely, use $(D lambda = 0):
+
+----
+string[] s = ["Hello", "brave", "new", "world"];
+string[] t = ["Hello", "new", "world"];
+assert(gapWeightedSimilarity(s, t, 0) == 4);
+----
+
+The call above eliminated the gapped matches ($(D "Hello", "new")),
+($(D "Hello", "world")), and ($(D "Hello", "new", "world")) from the
+tally. That leaves only 4 matches.
+
+The most interesting case is when gapped matches still participate in
+the result, but not as strongly as ungapped matches. The result will
+be a smooth, fine-grained similarity measure between the input
+strings. This is where values of $(D lambda) between 0 and 1 enter
+into play: gapped matches are $(I exponentially penalized with the
+number of gaps) with base $(D lambda). This means that an ungapped
+match adds 1 to the return value; a match with one gap in either
+string adds $(D lambda) to the return value; ...; a match with a total
+of $(D n) gaps in both strings adds $(D pow(lambda, n)) to the return
+value. In the example above, we have 4 matches without gaps, 2 matches
+with one gap, and 1 match with three gaps. The latter match is ($(D
+"Hello", "world")), which has two gaps in the first string and one gap
+in the second string, totaling to three gaps. Summing these up we get
+$(D 4 + 2 * lambda + pow(lambda, 3)).
+
+----
+string[] s = ["Hello", "brave", "new", "world"];
+string[] t = ["Hello", "new", "world"];
+assert(gapWeightedSimilarity(s, t, 0.5) == 4 + 0.5 * 2 + 0.125);
+----
+
+$(D gapWeightedSimilarity) is useful wherever a smooth similarity
+measure between sequences allowing for approximate matches is
+needed. The examples above are given with words, but any sequences
+with elements comparable for equality are allowed, e.g. characters or
+numbers. $(D gapWeightedSimilarity) uses a highly optimized dynamic
+programming implementation that needs $(D 16 * min(s.length,
+t.length)) extra bytes of memory and $(BIGOH s.length * t.length) time
+to complete.
+ */
+F gapWeightedSimilarity(alias comp = "a == b", R1, R2, F)(R1 s, R2 t, F lambda)
+if (isRandomAccessRange!(R1) && hasLength!(R1) &&
+    isRandomAccessRange!(R2) && hasLength!(R2))
+{
+    import core.exception : onOutOfMemoryError;
+    import core.stdc.stdlib : malloc, free;
+    import std.algorithm.mutation : swap;
+    import std.functional : binaryFun;
+
+    if (s.length < t.length) return gapWeightedSimilarity(t, s, lambda);
+    if (!t.length) return 0;
+
+    auto dpvi = cast(F*) malloc(F.sizeof * 2 * t.length);
+    if (!dpvi)
+        onOutOfMemoryError();
+
+    auto dpvi1 = dpvi + t.length;
+    scope(exit) free(dpvi < dpvi1 ? dpvi : dpvi1);
+    dpvi[0 .. t.length] = 0;
+    dpvi1[0] = 0;
+    immutable lambda2 = lambda * lambda;
+
+    F result = 0;
+    foreach (i; 0 .. s.length)
+    {
+        const si = s[i];
+        for (size_t j = 0;;)
+        {
+            F dpsij = void;
+            if (binaryFun!(comp)(si, t[j]))
+            {
+                dpsij = 1 + dpvi[j];
+                result += dpsij;
+            }
+            else
+            {
+                dpsij = 0;
+            }
+            immutable j1 = j + 1;
+            if (j1 == t.length) break;
+            dpvi1[j1] = dpsij + lambda * (dpvi1[j] + dpvi[j1]) -
+                        lambda2 * dpvi[j];
+            j = j1;
+        }
+        swap(dpvi, dpvi1);
+    }
+    return result;
+}
+
+@system unittest
+{
+    string[] s = ["Hello", "brave", "new", "world"];
+    string[] t = ["Hello", "new", "world"];
+    assert(gapWeightedSimilarity(s, t, 1) == 7);
+    assert(gapWeightedSimilarity(s, t, 0) == 4);
+    assert(gapWeightedSimilarity(s, t, 0.5) == 4 + 2 * 0.5 + 0.125);
+}
+
+/**
+The similarity per $(D gapWeightedSimilarity) has an issue in that it
+grows with the lengths of the two strings, even though the strings are
+not actually very similar. For example, the range $(D ["Hello",
+"world"]) is increasingly similar with the range $(D ["Hello",
+"world", "world", "world",...]) as more instances of $(D "world") are
+appended. To prevent that, $(D gapWeightedSimilarityNormalized)
+computes a normalized version of the similarity that is computed as
+$(D gapWeightedSimilarity(s, t, lambda) /
+sqrt(gapWeightedSimilarity(s, t, lambda) * gapWeightedSimilarity(s, t,
+lambda))). The function $(D gapWeightedSimilarityNormalized) (a
+so-called normalized kernel) is bounded in $(D [0, 1]), reaches $(D 0)
+only for ranges that don't match in any position, and $(D 1) only for
+identical ranges.
+
+The optional parameters $(D sSelfSim) and $(D tSelfSim) are meant for
+avoiding duplicate computation. Many applications may have already
+computed $(D gapWeightedSimilarity(s, s, lambda)) and/or $(D
+gapWeightedSimilarity(t, t, lambda)). In that case, they can be passed
+as $(D sSelfSim) and $(D tSelfSim), respectively.
+ */
+Select!(isFloatingPoint!(F), F, double)
+gapWeightedSimilarityNormalized(alias comp = "a == b", R1, R2, F)
+        (R1 s, R2 t, F lambda, F sSelfSim = F.init, F tSelfSim = F.init)
+if (isRandomAccessRange!(R1) && hasLength!(R1) &&
+    isRandomAccessRange!(R2) && hasLength!(R2))
+{
+    static bool uncomputed(F n)
+    {
+        static if (isFloatingPoint!(F))
+            return isNaN(n);
+        else
+            return n == n.init;
+    }
+    if (uncomputed(sSelfSim))
+        sSelfSim = gapWeightedSimilarity!(comp)(s, s, lambda);
+    if (sSelfSim == 0) return 0;
+    if (uncomputed(tSelfSim))
+        tSelfSim = gapWeightedSimilarity!(comp)(t, t, lambda);
+    if (tSelfSim == 0) return 0;
+
+    return gapWeightedSimilarity!(comp)(s, t, lambda) /
+           sqrt(cast(typeof(return)) sSelfSim * tSelfSim);
+}
+
+///
+@system unittest
+{
+    import std.math : approxEqual, sqrt;
+
+    string[] s = ["Hello", "brave", "new", "world"];
+    string[] t = ["Hello", "new", "world"];
+    assert(gapWeightedSimilarity(s, s, 1) == 15);
+    assert(gapWeightedSimilarity(t, t, 1) == 7);
+    assert(gapWeightedSimilarity(s, t, 1) == 7);
+    assert(approxEqual(gapWeightedSimilarityNormalized(s, t, 1),
+                    7.0 / sqrt(15.0 * 7), 0.01));
+}
+
+/**
+Similar to $(D gapWeightedSimilarity), just works in an incremental
+manner by first revealing the matches of length 1, then gapped matches
+of length 2, and so on. The memory requirement is $(BIGOH s.length *
+t.length). The time complexity is $(BIGOH s.length * t.length) time
+for computing each step. Continuing on the previous example:
+
+The implementation is based on the pseudocode in Fig. 4 of the paper
+$(HTTP jmlr.csail.mit.edu/papers/volume6/rousu05a/rousu05a.pdf,
+"Efﬁcient Computation of Gapped Substring Kernels on Large Alphabets")
+by Rousu et al., with additional algorithmic and systems-level
+optimizations.
+ */
+struct GapWeightedSimilarityIncremental(Range, F = double)
+if (isRandomAccessRange!(Range) && hasLength!(Range))
+{
+    import core.stdc.stdlib : malloc, realloc, alloca, free;
+
+private:
+    Range s, t;
+    F currentValue = 0;
+    F* kl;
+    size_t gram = void;
+    F lambda = void, lambda2 = void;
+
+public:
+/**
+Constructs an object given two ranges $(D s) and $(D t) and a penalty
+$(D lambda). Constructor completes in $(BIGOH s.length * t.length)
+time and computes all matches of length 1.
+ */
+    this(Range s, Range t, F lambda)
+    {
+        import core.exception : onOutOfMemoryError;
+
+        assert(lambda > 0);
+        this.gram = 0;
+        this.lambda = lambda;
+        this.lambda2 = lambda * lambda; // for efficiency only
+
+        size_t iMin = size_t.max, jMin = size_t.max,
+            iMax = 0, jMax = 0;
+        /* initialize */
+        Tuple!(size_t, size_t) * k0;
+        size_t k0len;
+        scope(exit) free(k0);
+        currentValue = 0;
+        foreach (i, si; s)
+        {
+            foreach (j; 0 .. t.length)
+            {
+                if (si != t[j]) continue;
+                k0 = cast(typeof(k0)) realloc(k0, ++k0len * (*k0).sizeof);
+                with (k0[k0len - 1])
+                {
+                    field[0] = i;
+                    field[1] = j;
+                }
+                // Maintain the minimum and maximum i and j
+                if (iMin > i) iMin = i;
+                if (iMax < i) iMax = i;
+                if (jMin > j) jMin = j;
+                if (jMax < j) jMax = j;
+            }
+        }
+
+        if (iMin > iMax) return;
+        assert(k0len);
+
+        currentValue = k0len;
+        // Chop strings down to the useful sizes
+        s = s[iMin .. iMax + 1];
+        t = t[jMin .. jMax + 1];
+        this.s = s;
+        this.t = t;
+
+        kl = cast(F*) malloc(s.length * t.length * F.sizeof);
+        if (!kl)
+            onOutOfMemoryError();
+
+        kl[0 .. s.length * t.length] = 0;
+        foreach (pos; 0 .. k0len)
+        {
+            with (k0[pos])
+            {
+                kl[(field[0] - iMin) * t.length + field[1] -jMin] = lambda2;
+            }
+        }
+    }
+
+    /**
+    Returns: $(D this).
+     */
+    ref GapWeightedSimilarityIncremental opSlice()
+    {
+        return this;
+    }
+
+    /**
+    Computes the match of the popFront length. Completes in $(BIGOH s.length *
+    t.length) time.
+     */
+    void popFront()
+    {
+        import std.algorithm.mutation : swap;
+
+        // This is a large source of optimization: if similarity at
+        // the gram-1 level was 0, then we can safely assume
+        // similarity at the gram level is 0 as well.
+        if (empty) return;
+
+        // Now attempt to match gapped substrings of length `gram'
+        ++gram;
+        currentValue = 0;
+
+        auto Si = cast(F*) alloca(t.length * F.sizeof);
+        Si[0 .. t.length] = 0;
+        foreach (i; 0 .. s.length)
+        {
+            const si = s[i];
+            F Sij_1 = 0;
+            F Si_1j_1 = 0;
+            auto kli = kl + i * t.length;
+            for (size_t j = 0;;)
+            {
+                const klij = kli[j];
+                const Si_1j = Si[j];
+                const tmp = klij + lambda * (Si_1j + Sij_1) - lambda2 * Si_1j_1;
+                // now update kl and currentValue
+                if (si == t[j])
+                    currentValue += kli[j] = lambda2 * Si_1j_1;
+                else
+                    kli[j] = 0;
+                // commit to Si
+                Si[j] = tmp;
+                if (++j == t.length) break;
+                // get ready for the popFront step; virtually increment j,
+                // so essentially stuffj_1 <-- stuffj
+                Si_1j_1 = Si_1j;
+                Sij_1 = tmp;
+            }
+        }
+        currentValue /= pow(lambda, 2 * (gram + 1));
+
+        version (none)
+        {
+            Si_1[0 .. t.length] = 0;
+            kl[0 .. min(t.length, maxPerimeter + 1)] = 0;
+            foreach (i; 1 .. min(s.length, maxPerimeter + 1))
+            {
+                auto kli = kl + i * t.length;
+                assert(s.length > i);
+                const si = s[i];
+                auto kl_1i_1 = kl_1 + (i - 1) * t.length;
+                kli[0] = 0;
+                F lastS = 0;
+                foreach (j; 1 .. min(maxPerimeter - i + 1, t.length))
+                {
+                    immutable j_1 = j - 1;
+                    immutable tmp = kl_1i_1[j_1]
+                        + lambda * (Si_1[j] + lastS)
+                        - lambda2 * Si_1[j_1];
+                    kl_1i_1[j_1] = float.nan;
+                    Si_1[j_1] = lastS;
+                    lastS = tmp;
+                    if (si == t[j])
+                    {
+                        currentValue += kli[j] = lambda2 * lastS;
+                    }
+                    else
+                    {
+                        kli[j] = 0;
+                    }
+                }
+                Si_1[t.length - 1] = lastS;
+            }
+            currentValue /= pow(lambda, 2 * (gram + 1));
+            // get ready for the popFront computation
+            swap(kl, kl_1);
+        }
+    }
+
+    /**
+    Returns: The gapped similarity at the current match length (initially
+    1, grows with each call to $(D popFront)).
+    */
+    @property F front() { return currentValue; }
+
+    /**
+    Returns: Whether there are more matches.
+     */
+    @property bool empty()
+    {
+        if (currentValue) return false;
+        if (kl)
+        {
+            free(kl);
+            kl = null;
+        }
+        return true;
+    }
+}
+
+/**
+Ditto
+ */
+GapWeightedSimilarityIncremental!(R, F) gapWeightedSimilarityIncremental(R, F)
+(R r1, R r2, F penalty)
+{
+    return typeof(return)(r1, r2, penalty);
+}
+
+///
+@system unittest
+{
+    string[] s = ["Hello", "brave", "new", "world"];
+    string[] t = ["Hello", "new", "world"];
+    auto simIter = gapWeightedSimilarityIncremental(s, t, 1.0);
+    assert(simIter.front == 3); // three 1-length matches
+    simIter.popFront();
+    assert(simIter.front == 3); // three 2-length matches
+    simIter.popFront();
+    assert(simIter.front == 1); // one 3-length match
+    simIter.popFront();
+    assert(simIter.empty);     // no more match
+}
+
+@system unittest
+{
+    import std.conv : text;
+    string[] s = ["Hello", "brave", "new", "world"];
+    string[] t = ["Hello", "new", "world"];
+    auto simIter = gapWeightedSimilarityIncremental(s, t, 1.0);
+    //foreach (e; simIter) writeln(e);
+    assert(simIter.front == 3); // three 1-length matches
+    simIter.popFront();
+    assert(simIter.front == 3, text(simIter.front)); // three 2-length matches
+    simIter.popFront();
+    assert(simIter.front == 1); // one 3-length matches
+    simIter.popFront();
+    assert(simIter.empty);     // no more match
+
+    s = ["Hello"];
+    t = ["bye"];
+    simIter = gapWeightedSimilarityIncremental(s, t, 0.5);
+    assert(simIter.empty);
+
+    s = ["Hello"];
+    t = ["Hello"];
+    simIter = gapWeightedSimilarityIncremental(s, t, 0.5);
+    assert(simIter.front == 1); // one match
+    simIter.popFront();
+    assert(simIter.empty);
+
+    s = ["Hello", "world"];
+    t = ["Hello"];
+    simIter = gapWeightedSimilarityIncremental(s, t, 0.5);
+    assert(simIter.front == 1); // one match
+    simIter.popFront();
+    assert(simIter.empty);
+
+    s = ["Hello", "world"];
+    t = ["Hello", "yah", "world"];
+    simIter = gapWeightedSimilarityIncremental(s, t, 0.5);
+    assert(simIter.front == 2); // two 1-gram matches
+    simIter.popFront();
+    assert(simIter.front == 0.5, text(simIter.front)); // one 2-gram match, 1 gap
+}
+
+@system unittest
+{
+    GapWeightedSimilarityIncremental!(string[]) sim =
+        GapWeightedSimilarityIncremental!(string[])(
+            ["nyuk", "I", "have", "no", "chocolate", "giba"],
+            ["wyda", "I", "have", "I", "have", "have", "I", "have", "hehe"],
+            0.5);
+    double[] witness = [ 7.0, 4.03125, 0, 0 ];
+    foreach (e; sim)
+    {
+        //writeln(e);
+        assert(e == witness.front);
+        witness.popFront();
+    }
+    witness = [ 3.0, 1.3125, 0.25 ];
+    sim = GapWeightedSimilarityIncremental!(string[])(
+        ["I", "have", "no", "chocolate"],
+        ["I", "have", "some", "chocolate"],
+        0.5);
+    foreach (e; sim)
+    {
+        //writeln(e);
+        assert(e == witness.front);
+        witness.popFront();
+    }
+    assert(witness.empty);
+}
+
+/**
+Computes the greatest common divisor of $(D a) and $(D b) by using
+an efficient algorithm such as $(HTTPS en.wikipedia.org/wiki/Euclidean_algorithm, Euclid's)
+or $(HTTPS en.wikipedia.org/wiki/Binary_GCD_algorithm, Stein's) algorithm.
+
+Params:
+    T = Any numerical type that supports the modulo operator `%`. If
+        bit-shifting `<<` and `>>` are also supported, Stein's algorithm will
+        be used; otherwise, Euclid's algorithm is used as _a fallback.
+Returns:
+    The greatest common divisor of the given arguments.
+ */
+T gcd(T)(T a, T b)
+    if (isIntegral!T)
+{
+    static if (is(T == const) || is(T == immutable))
+    {
+        return gcd!(Unqual!T)(a, b);
+    }
+    else version (DigitalMars)
+    {
+        static if (T.min < 0)
+        {
+            assert(a >= 0 && b >= 0);
+        }
+        while (b)
+        {
+            immutable t = b;
+            b = a % b;
+            a = t;
+        }
+        return a;
+    }
+    else
+    {
+        if (a == 0)
+            return b;
+        if (b == 0)
+            return a;
+
+        import core.bitop : bsf;
+        import std.algorithm.mutation : swap;
+
+        immutable uint shift = bsf(a | b);
+        a >>= a.bsf;
+
+        do
+        {
+            b >>= b.bsf;
+            if (a > b)
+                swap(a, b);
+            b -= a;
+        } while (b);
+
+        return a << shift;
+    }
+}
+
+///
+@safe unittest
+{
+    assert(gcd(2 * 5 * 7 * 7, 5 * 7 * 11) == 5 * 7);
+    const int a = 5 * 13 * 23 * 23, b = 13 * 59;
+    assert(gcd(a, b) == 13);
+}
+
+// This overload is for non-builtin numerical types like BigInt or
+// user-defined types.
+/// ditto
+T gcd(T)(T a, T b)
+    if (!isIntegral!T &&
+        is(typeof(T.init % T.init)) &&
+        is(typeof(T.init == 0 || T.init > 0)))
+{
+    import std.algorithm.mutation : swap;
+
+    enum canUseBinaryGcd = is(typeof(() {
+        T t, u;
+        t <<= 1;
+        t >>= 1;
+        t -= u;
+        bool b = (t & 1) == 0;
+        swap(t, u);
+    }));
+
+    assert(a >= 0 && b >= 0);
+
+    static if (canUseBinaryGcd)
+    {
+        uint shift = 0;
+        while ((a & 1) == 0 && (b & 1) == 0)
+        {
+            a >>= 1;
+            b >>= 1;
+            shift++;
+        }
+
+        do
+        {
+            assert((a & 1) != 0);
+            while ((b & 1) == 0)
+                b >>= 1;
+            if (a > b)
+                swap(a, b);
+            b -= a;
+        } while (b);
+
+        return a << shift;
+    }
+    else
+    {
+        // The only thing we have is %; fallback to Euclidean algorithm.
+        while (b != 0)
+        {
+            auto t = b;
+            b = a % b;
+            a = t;
+        }
+        return a;
+    }
+}
+
+// Issue 7102
+@system pure unittest
+{
+    import std.bigint : BigInt;
+    assert(gcd(BigInt("71_000_000_000_000_000_000"),
+               BigInt("31_000_000_000_000_000_000")) ==
+           BigInt("1_000_000_000_000_000_000"));
+}
+
+@safe pure nothrow unittest
+{
+    // A numerical type that only supports % and - (to force gcd implementation
+    // to use Euclidean algorithm).
+    struct CrippledInt
+    {
+        int impl;
+        CrippledInt opBinary(string op : "%")(CrippledInt i)
+        {
+            return CrippledInt(impl % i.impl);
+        }
+        int opEquals(CrippledInt i) { return impl == i.impl; }
+        int opEquals(int i) { return impl == i; }
+        int opCmp(int i) { return (impl < i) ? -1 : (impl > i) ? 1 : 0; }
+    }
+    assert(gcd(CrippledInt(2310), CrippledInt(1309)) == CrippledInt(77));
+}
+
+// This is to make tweaking the speed/size vs. accuracy tradeoff easy,
+// though floats seem accurate enough for all practical purposes, since
+// they pass the "approxEqual(inverseFft(fft(arr)), arr)" test even for
+// size 2 ^^ 22.
+private alias lookup_t = float;
+
+/**A class for performing fast Fourier transforms of power of two sizes.
+ * This class encapsulates a large amount of state that is reusable when
+ * performing multiple FFTs of sizes smaller than or equal to that specified
+ * in the constructor.  This results in substantial speedups when performing
+ * multiple FFTs with a known maximum size.  However,
+ * a free function API is provided for convenience if you need to perform a
+ * one-off FFT.
+ *
+ * References:
+ * $(HTTP en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)
+ */
+final class Fft
+{
+    import core.bitop : bsf;
+    import std.algorithm.iteration : map;
+    import std.array : uninitializedArray;
+
+private:
+    immutable lookup_t[][] negSinLookup;
+
+    void enforceSize(R)(R range) const
+    {
+        import std.conv : text;
+        assert(range.length <= size, text(
+            "FFT size mismatch.  Expected ", size, ", got ", range.length));
+    }
+
+    void fftImpl(Ret, R)(Stride!R range, Ret buf) const
+    in
+    {
+        assert(range.length >= 4);
+        assert(isPowerOf2(range.length));
+    }
+    body
+    {
+        auto recurseRange = range;
+        recurseRange.doubleSteps();
+
+        if (buf.length > 4)
+        {
+            fftImpl(recurseRange, buf[0..$ / 2]);
+            recurseRange.popHalf();
+            fftImpl(recurseRange, buf[$ / 2..$]);
+        }
+        else
+        {
+            // Do this here instead of in another recursion to save on
+            // recursion overhead.
+            slowFourier2(recurseRange, buf[0..$ / 2]);
+            recurseRange.popHalf();
+            slowFourier2(recurseRange, buf[$ / 2..$]);
+        }
+
+        butterfly(buf);
+    }
+
+    // This algorithm works by performing the even and odd parts of our FFT
+    // using the "two for the price of one" method mentioned at
+    // http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM#Head521
+    // by making the odd terms into the imaginary components of our new FFT,
+    // and then using symmetry to recombine them.
+    void fftImplPureReal(Ret, R)(R range, Ret buf) const
+    in
+    {
+        assert(range.length >= 4);
+        assert(isPowerOf2(range.length));
+    }
+    body
+    {
+        alias E = ElementType!R;
+
+        // Converts odd indices of range to the imaginary components of
+        // a range half the size.  The even indices become the real components.
+        static if (isArray!R && isFloatingPoint!E)
+        {
+            // Then the memory layout of complex numbers provides a dirt
+            // cheap way to convert.  This is a common case, so take advantage.
+            auto oddsImag = cast(Complex!E[]) range;
+        }
+        else
+        {
+            // General case:  Use a higher order range.  We can assume
+            // source.length is even because it has to be a power of 2.
+            static struct OddToImaginary
+            {
+                R source;
+                alias C = Complex!(CommonType!(E, typeof(buf[0].re)));
+
+                @property
+                {
+                    C front()
+                    {
+                        return C(source[0], source[1]);
+                    }
+
+                    C back()
+                    {
+                        immutable n = source.length;
+                        return C(source[n - 2], source[n - 1]);
+                    }
+
+                    typeof(this) save()
+                    {
+                        return typeof(this)(source.save);
+                    }
+
+                    bool empty()
+                    {
+                        return source.empty;
+                    }
+
+                    size_t length()
+                    {
+                        return source.length / 2;
+                    }
+                }
+
+                void popFront()
+                {
+                    source.popFront();
+                    source.popFront();
+                }
+
+                void popBack()
+                {
+                    source.popBack();
+                    source.popBack();
+                }
+
+                C opIndex(size_t index)
+                {
+                    return C(source[index * 2], source[index * 2 + 1]);
+                }
+
+                typeof(this) opSlice(size_t lower, size_t upper)
+                {
+                    return typeof(this)(source[lower * 2 .. upper * 2]);
+                }
+            }
+
+            auto oddsImag = OddToImaginary(range);
+        }
+
+        fft(oddsImag, buf[0..$ / 2]);
+        auto evenFft = buf[0..$ / 2];
+        auto oddFft = buf[$ / 2..$];
+        immutable halfN = evenFft.length;
+        oddFft[0].re = buf[0].im;
+        oddFft[0].im = 0;
+        evenFft[0].im = 0;
+        // evenFft[0].re is already right b/c it's aliased with buf[0].re.
+
+        foreach (k; 1 .. halfN / 2 + 1)
+        {
+            immutable bufk = buf[k];
+            immutable bufnk = buf[buf.length / 2 - k];
+            evenFft[k].re = 0.5 * (bufk.re + bufnk.re);
+            evenFft[halfN - k].re = evenFft[k].re;
+            evenFft[k].im = 0.5 * (bufk.im - bufnk.im);
+            evenFft[halfN - k].im = -evenFft[k].im;
+
+            oddFft[k].re = 0.5 * (bufk.im + bufnk.im);
+            oddFft[halfN - k].re = oddFft[k].re;
+            oddFft[k].im = 0.5 * (bufnk.re - bufk.re);
+            oddFft[halfN - k].im = -oddFft[k].im;
+        }
+
+        butterfly(buf);
+    }
+
+    void butterfly(R)(R buf) const
+    in
+    {
+        assert(isPowerOf2(buf.length));
+    }
+    body
+    {
+        immutable n = buf.length;
+        immutable localLookup = negSinLookup[bsf(n)];
+        assert(localLookup.length == n);
+
+        immutable cosMask = n - 1;
+        immutable cosAdd = n / 4 * 3;
+
+        lookup_t negSinFromLookup(size_t index) pure nothrow
+        {
+            return localLookup[index];
+        }
+
+        lookup_t cosFromLookup(size_t index) pure nothrow
+        {
+            // cos is just -sin shifted by PI * 3 / 2.
+            return localLookup[(index + cosAdd) & cosMask];
+        }
+
+        immutable halfLen = n / 2;
+
+        // This loop is unrolled and the two iterations are interleaved
+        // relative to the textbook FFT to increase ILP.  This gives roughly 5%
+        // speedups on DMD.
+        for (size_t k = 0; k < halfLen; k += 2)
+        {
+            immutable cosTwiddle1 = cosFromLookup(k);
+            immutable sinTwiddle1 = negSinFromLookup(k);
+            immutable cosTwiddle2 = cosFromLookup(k + 1);
+            immutable sinTwiddle2 = negSinFromLookup(k + 1);
+
+            immutable realLower1 = buf[k].re;
+            immutable imagLower1 = buf[k].im;
+            immutable realLower2 = buf[k + 1].re;
+            immutable imagLower2 = buf[k + 1].im;
+
+            immutable upperIndex1 = k + halfLen;
+            immutable upperIndex2 = upperIndex1 + 1;
+            immutable realUpper1 = buf[upperIndex1].re;
+            immutable imagUpper1 = buf[upperIndex1].im;
+            immutable realUpper2 = buf[upperIndex2].re;
+            immutable imagUpper2 = buf[upperIndex2].im;
+
+            immutable realAdd1 = cosTwiddle1 * realUpper1
+                               - sinTwiddle1 * imagUpper1;
+            immutable imagAdd1 = sinTwiddle1 * realUpper1
+                               + cosTwiddle1 * imagUpper1;
+            immutable realAdd2 = cosTwiddle2 * realUpper2
+                               - sinTwiddle2 * imagUpper2;
+            immutable imagAdd2 = sinTwiddle2 * realUpper2
+                               + cosTwiddle2 * imagUpper2;
+
+            buf[k].re += realAdd1;
+            buf[k].im += imagAdd1;
+            buf[k + 1].re += realAdd2;
+            buf[k + 1].im += imagAdd2;
+
+            buf[upperIndex1].re = realLower1 - realAdd1;
+            buf[upperIndex1].im = imagLower1 - imagAdd1;
+            buf[upperIndex2].re = realLower2 - realAdd2;
+            buf[upperIndex2].im = imagLower2 - imagAdd2;
+        }
+    }
+
+    // This constructor is used within this module for allocating the
+    // buffer space elsewhere besides the GC heap.  It's definitely **NOT**
+    // part of the public API and definitely **IS** subject to change.
+    //
+    // Also, this is unsafe because the memSpace buffer will be cast
+    // to immutable.
+    public this(lookup_t[] memSpace)  // Public b/c of bug 4636.
+    {
+        immutable size = memSpace.length / 2;
+
+        /* Create a lookup table of all negative sine values at a resolution of
+         * size and all smaller power of two resolutions.  This may seem
+         * inefficient, but having all the lookups be next to each other in
+         * memory at every level of iteration is a huge win performance-wise.
+         */
+        if (size == 0)
+        {
+            return;
+        }
+
+        assert(isPowerOf2(size),
+            "Can only do FFTs on ranges with a size that is a power of two.");
+
+        auto table = new lookup_t[][bsf(size) + 1];
+
+        table[$ - 1] = memSpace[$ - size..$];
+        memSpace = memSpace[0 .. size];
+
+        auto lastRow = table[$ - 1];
+        lastRow[0] = 0;  // -sin(0) == 0.
+        foreach (ptrdiff_t i; 1 .. size)
+        {
+            // The hard coded cases are for improved accuracy and to prevent
+            // annoying non-zeroness when stuff should be zero.
+
+            if (i == size / 4)
+                lastRow[i] = -1;  // -sin(pi / 2) == -1.
+            else if (i == size / 2)
+                lastRow[i] = 0;   // -sin(pi) == 0.
+            else if (i == size * 3 / 4)
+                lastRow[i] = 1;  // -sin(pi * 3 / 2) == 1
+            else
+                lastRow[i] = -sin(i * 2.0L * PI / size);
+        }
+
+        // Fill in all the other rows with strided versions.
+        foreach (i; 1 .. table.length - 1)
+        {
+            immutable strideLength = size / (2 ^^ i);
+            auto strided = Stride!(lookup_t[])(lastRow, strideLength);
+            table[i] = memSpace[$ - strided.length..$];
+            memSpace = memSpace[0..$ - strided.length];
+
+            size_t copyIndex;
+            foreach (elem; strided)
+            {
+                table[i][copyIndex++] = elem;
+            }
+        }
+
+        negSinLookup = cast(immutable) table;
+    }
+
+public:
+    /**Create an $(D Fft) object for computing fast Fourier transforms of
+     * power of two sizes of $(D size) or smaller.  $(D size) must be a
+     * power of two.
+     */
+    this(size_t size)
+    {
+        // Allocate all twiddle factor buffers in one contiguous block so that,
+        // when one is done being used, the next one is next in cache.
+        auto memSpace = uninitializedArray!(lookup_t[])(2 * size);
+        this(memSpace);
+    }
+
+    @property size_t size() const
+    {
+        return (negSinLookup is null) ? 0 : negSinLookup[$ - 1].length;
+    }
+
+    /**Compute the Fourier transform of range using the $(BIGOH N log N)
+     * Cooley-Tukey Algorithm.  $(D range) must be a random-access range with
+     * slicing and a length equal to $(D size) as provided at the construction of
+     * this object.  The contents of range can be either  numeric types,
+     * which will be interpreted as pure real values, or complex types with
+     * properties or members $(D .re) and $(D .im) that can be read.
+     *
+     * Note:  Pure real FFTs are automatically detected and the relevant
+     *        optimizations are performed.
+     *
+     * Returns:  An array of complex numbers representing the transformed data in
+     *           the frequency domain.
+     *
+     * Conventions: The exponent is negative and the factor is one,
+     *              i.e., output[j] := sum[ exp(-2 PI i j k / N) input[k] ].
+     */
+    Complex!F[] fft(F = double, R)(R range) const
+        if (isFloatingPoint!F && isRandomAccessRange!R)
+    {
+        enforceSize(range);
+        Complex!F[] ret;
+        if (range.length == 0)
+        {
+            return ret;
+        }
+
+        // Don't waste time initializing the memory for ret.
+        ret = uninitializedArray!(Complex!F[])(range.length);
+
+        fft(range,  ret);
+        return ret;
+    }
+
+    /**Same as the overload, but allows for the results to be stored in a user-
+     * provided buffer.  The buffer must be of the same length as range, must be
+     * a random-access range, must have slicing, and must contain elements that are
+     * complex-like.  This means that they must have a .re and a .im member or
+     * property that can be both read and written and are floating point numbers.
+     */
+    void fft(Ret, R)(R range, Ret buf) const
+        if (isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret)
+    {
+        assert(buf.length == range.length);
+        enforceSize(range);
+
+        if (range.length == 0)
+        {
+            return;
+        }
+        else if (range.length == 1)
+        {
+            buf[0] = range[0];
+            return;
+        }
+        else if (range.length == 2)
+        {
+            slowFourier2(range, buf);
+            return;
+        }
+        else
+        {
+            alias E = ElementType!R;
+            static if (is(E : real))
+            {
+                return fftImplPureReal(range, buf);
+            }
+            else
+            {
+                static if (is(R : Stride!R))
+                    return fftImpl(range, buf);
+                else
+                    return fftImpl(Stride!R(range, 1), buf);
+            }
+        }
+    }
+
+    /**
+     * Computes the inverse Fourier transform of a range.  The range must be a
+     * random access range with slicing, have a length equal to the size
+     * provided at construction of this object, and contain elements that are
+     * either of type std.complex.Complex or have essentially
+     * the same compile-time interface.
+     *
+     * Returns:  The time-domain signal.
+     *
+     * Conventions: The exponent is positive and the factor is 1/N, i.e.,
+     *              output[j] := (1 / N) sum[ exp(+2 PI i j k / N) input[k] ].
+     */
+    Complex!F[] inverseFft(F = double, R)(R range) const
+        if (isRandomAccessRange!R && isComplexLike!(ElementType!R) && isFloatingPoint!F)
+    {
+        enforceSize(range);
+        Complex!F[] ret;
+        if (range.length == 0)
+        {
+            return ret;
+        }
+
+        // Don't waste time initializing the memory for ret.
+        ret = uninitializedArray!(Complex!F[])(range.length);
+
+        inverseFft(range, ret);
+        return ret;
+    }
+
+    /**
+     * Inverse FFT that allows a user-supplied buffer to be provided.  The buffer
+     * must be a random access range with slicing, and its elements
+     * must be some complex-like type.
+     */
+    void inverseFft(Ret, R)(R range, Ret buf) const
+        if (isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret)
+    {
+        enforceSize(range);
+
+        auto swapped = map!swapRealImag(range);
+        fft(swapped,  buf);
+
+        immutable lenNeg1 = 1.0 / buf.length;
+        foreach (ref elem; buf)
+        {
+            immutable temp = elem.re * lenNeg1;
+            elem.re = elem.im * lenNeg1;
+            elem.im = temp;
+        }
+    }
+}
+
+// This mixin creates an Fft object in the scope it's mixed into such that all
+// memory owned by the object is deterministically destroyed at the end of that
+// scope.
+private enum string MakeLocalFft = q{
+    import core.stdc.stdlib;
+    import core.exception : onOutOfMemoryError;
+
+    auto lookupBuf = (cast(lookup_t*) malloc(range.length * 2 * lookup_t.sizeof))
+                     [0 .. 2 * range.length];
+    if (!lookupBuf.ptr)
+        onOutOfMemoryError();
+
+    scope(exit) free(cast(void*) lookupBuf.ptr);
+    auto fftObj = scoped!Fft(lookupBuf);
+};
+
+/**Convenience functions that create an $(D Fft) object, run the FFT or inverse
+ * FFT and return the result.  Useful for one-off FFTs.
+ *
+ * Note:  In addition to convenience, these functions are slightly more
+ *        efficient than manually creating an Fft object for a single use,
+ *        as the Fft object is deterministically destroyed before these
+ *        functions return.
+ */
+Complex!F[] fft(F = double, R)(R range)
+{
+    mixin(MakeLocalFft);
+    return fftObj.fft!(F, R)(range);
+}
+
+/// ditto
+void fft(Ret, R)(R range, Ret buf)
+{
+    mixin(MakeLocalFft);
+    return fftObj.fft!(Ret, R)(range, buf);
+}
+
+/// ditto
+Complex!F[] inverseFft(F = double, R)(R range)
+{
+    mixin(MakeLocalFft);
+    return fftObj.inverseFft!(F, R)(range);
+}
+
+/// ditto
+void inverseFft(Ret, R)(R range, Ret buf)
+{
+    mixin(MakeLocalFft);
+    return fftObj.inverseFft!(Ret, R)(range, buf);
+}
+
+@system unittest
+{
+    import std.algorithm;
+    import std.conv;
+    import std.range;
+    // Test values from R and Octave.
+    auto arr = [1,2,3,4,5,6,7,8];
+    auto fft1 = fft(arr);
+    assert(approxEqual(map!"a.re"(fft1),
+        [36.0, -4, -4, -4, -4, -4, -4, -4]));
+    assert(approxEqual(map!"a.im"(fft1),
+        [0, 9.6568, 4, 1.6568, 0, -1.6568, -4, -9.6568]));
+
+    auto fft1Retro = fft(retro(arr));
+    assert(approxEqual(map!"a.re"(fft1Retro),
+        [36.0, 4, 4, 4, 4, 4, 4, 4]));
+    assert(approxEqual(map!"a.im"(fft1Retro),
+        [0, -9.6568, -4, -1.6568, 0, 1.6568, 4, 9.6568]));
+
+    auto fft1Float = fft(to!(float[])(arr));
+    assert(approxEqual(map!"a.re"(fft1), map!"a.re"(fft1Float)));
+    assert(approxEqual(map!"a.im"(fft1), map!"a.im"(fft1Float)));
+
+    alias C = Complex!float;
+    auto arr2 = [C(1,2), C(3,4), C(5,6), C(7,8), C(9,10),
+        C(11,12), C(13,14), C(15,16)];
+    auto fft2 = fft(arr2);
+    assert(approxEqual(map!"a.re"(fft2),
+        [64.0, -27.3137, -16, -11.3137, -8, -4.6862, 0, 11.3137]));
+    assert(approxEqual(map!"a.im"(fft2),
+        [72, 11.3137, 0, -4.686, -8, -11.3137, -16, -27.3137]));
+
+    auto inv1 = inverseFft(fft1);
+    assert(approxEqual(map!"a.re"(inv1), arr));
+    assert(reduce!max(map!"a.im"(inv1)) < 1e-10);
+
+    auto inv2 = inverseFft(fft2);
+    assert(approxEqual(map!"a.re"(inv2), map!"a.re"(arr2)));
+    assert(approxEqual(map!"a.im"(inv2), map!"a.im"(arr2)));
+
+    // FFTs of size 0, 1 and 2 are handled as special cases.  Test them here.
+    ushort[] empty;
+    assert(fft(empty) == null);
+    assert(inverseFft(fft(empty)) == null);
+
+    real[] oneElem = [4.5L];
+    auto oneFft = fft(oneElem);
+    assert(oneFft.length == 1);
+    assert(oneFft[0].re == 4.5L);
+    assert(oneFft[0].im == 0);
+
+    auto oneInv = inverseFft(oneFft);
+    assert(oneInv.length == 1);
+    assert(approxEqual(oneInv[0].re, 4.5));
+    assert(approxEqual(oneInv[0].im, 0));
+
+    long[2] twoElems = [8, 4];
+    auto twoFft = fft(twoElems[]);
+    assert(twoFft.length == 2);
+    assert(approxEqual(twoFft[0].re, 12));
+    assert(approxEqual(twoFft[0].im, 0));
+    assert(approxEqual(twoFft[1].re, 4));
+    assert(approxEqual(twoFft[1].im, 0));
+    auto twoInv = inverseFft(twoFft);
+    assert(approxEqual(twoInv[0].re, 8));
+    assert(approxEqual(twoInv[0].im, 0));
+    assert(approxEqual(twoInv[1].re, 4));
+    assert(approxEqual(twoInv[1].im, 0));
+}
+
+// Swaps the real and imaginary parts of a complex number.  This is useful
+// for inverse FFTs.
+C swapRealImag(C)(C input)
+{
+    return C(input.im, input.re);
+}
+
+private:
+// The reasons I couldn't use std.algorithm were b/c its stride length isn't
+// modifiable on the fly and because range has grown some performance hacks
+// for powers of 2.
+struct Stride(R)
+{
+    import core.bitop : bsf;
+    Unqual!R range;
+    size_t _nSteps;
+    size_t _length;
+    alias E = ElementType!(R);
+
+    this(R range, size_t nStepsIn)
+    {
+        this.range = range;
+       _nSteps = nStepsIn;
+       _length = (range.length + _nSteps - 1) / nSteps;
+    }
+
+    size_t length() const @property
+    {
+        return _length;
+    }
+
+    typeof(this) save() @property
+    {
+        auto ret = this;
+        ret.range = ret.range.save;
+        return ret;
+    }
+
+    E opIndex(size_t index)
+    {
+        return range[index * _nSteps];
+    }
+
+    E front() @property
+    {
+        return range[0];
+    }
+
+    void popFront()
+    {
+        if (range.length >= _nSteps)
+        {
+            range = range[_nSteps .. range.length];
+            _length--;
+        }
+        else
+        {
+            range = range[0 .. 0];
+            _length = 0;
+        }
+    }
+
+    // Pops half the range's stride.
+    void popHalf()
+    {
+        range = range[_nSteps / 2 .. range.length];
+    }
+
+    bool empty() const @property
+    {
+        return length == 0;
+    }
+
+    size_t nSteps() const @property
+    {
+        return _nSteps;
+    }
+
+    void doubleSteps()
+    {
+        _nSteps *= 2;
+        _length /= 2;
+    }
+
+    size_t nSteps(size_t newVal) @property
+    {
+        _nSteps = newVal;
+
+        // Using >> bsf(nSteps) is a few cycles faster than / nSteps.
+        _length = (range.length + _nSteps - 1)  >> bsf(nSteps);
+        return newVal;
+    }
+}
+
+// Hard-coded base case for FFT of size 2.  This is actually a TON faster than
+// using a generic slow DFT.  This seems to be the best base case.  (Size 1
+// can be coded inline as buf[0] = range[0]).
+void slowFourier2(Ret, R)(R range, Ret buf)
+{
+    assert(range.length == 2);
+    assert(buf.length == 2);
+    buf[0] = range[0] + range[1];
+    buf[1] = range[0] - range[1];
+}
+
+// Hard-coded base case for FFT of size 4.  Doesn't work as well as the size
+// 2 case.
+void slowFourier4(Ret, R)(R range, Ret buf)
+{
+    alias C = ElementType!Ret;
+
+    assert(range.length == 4);
+    assert(buf.length == 4);
+    buf[0] = range[0] + range[1] + range[2] + range[3];
+    buf[1] = range[0] - range[1] * C(0, 1) - range[2] + range[3] * C(0, 1);
+    buf[2] = range[0] - range[1] + range[2] - range[3];
+    buf[3] = range[0] + range[1] * C(0, 1) - range[2] - range[3] * C(0, 1);
+}
+
+N roundDownToPowerOf2(N)(N num)
+if (isScalarType!N && !isFloatingPoint!N)
+{
+    import core.bitop : bsr;
+    return num & (cast(N) 1 << bsr(num));
+}
+
+@safe unittest
+{
+    assert(roundDownToPowerOf2(7) == 4);
+    assert(roundDownToPowerOf2(4) == 4);
+}
+
+template isComplexLike(T)
+{
+    enum bool isComplexLike = is(typeof(T.init.re)) &&
+        is(typeof(T.init.im));
+}
+
+@safe unittest
+{
+    static assert(isComplexLike!(Complex!double));
+    static assert(!isComplexLike!(uint));
+}