3 files changed, 286 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index 624e6f0..6d295e7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2015-06-01  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
+	* benchtests/scripts/compare_bench.py: New file.
+	* benchtests/scripts/import_bench.py (mean): New function.
+	(split_list): Likewise.
+	(do_for_all_timings): Likewise.
+	(compress_timings): Likewise.
+
 	* benchtests/scripts/import_bench.py: New file.
 	* benchtests/scripts/validate_benchout.py: Import import_bench
 	instead of jsonschema.
diff --git a/benchtests/scripts/compare_bench.py b/benchtests/scripts/compare_bench.py
new file mode 100755
index 0000000..be5b5ca
--- /dev/null
+++ b/benchtests/scripts/compare_bench.py
@@ -0,0 +1,184 @@
+#!/usr/bin/python
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+"""Compare two benchmark results
+
+Given two benchmark result files and a threshold, this script compares the
+benchmark results and flags differences in performance beyond a given
+threshold.
+"""
+import sys
+import os
+import pylab
+import import_bench as bench
+
+def do_compare(func, var, tl1, tl2, par, threshold):
+    """Compare one of the aggregate measurements
+
+    Helper function to compare one of the aggregate measurements of a function
+    variant.
+
+    Args:
+        func: Function name
+        var: Function variant name
+        tl1: The first timings list
+        tl2: The second timings list
+        par: The aggregate to measure
+        threshold: The threshold for differences, beyond which the script should
+        print a warning.
+    """
+    d = abs(tl2[par] - tl1[par]) * 100 / tl1[str(par)]
+    if d > threshold:
+        if tl1[par] > tl2[par]:
+            ind = '+++'
+        else:
+            ind = '---'
+        print('%s %s(%s)[%s]: (%.2lf%%) from %g to %g' %
+                (ind, func, var, par, d, tl1[par], tl2[par]))
+
+
+def compare_runs(pts1, pts2, threshold):
+    """Compare two benchmark runs
+
+    Args:
+        pts1: Timing data from first machine
+        pts2: Timing data from second machine
+    """
+
+    # XXX We assume that the two benchmarks have identical functions and
+    # variants.  We cannot compare two benchmarks that may have different
+    # functions or variants.  Maybe that is something for the future.
+    for func in pts1['functions'].keys():
+        for var in pts1['functions'][func].keys():
+            tl1 = pts1['functions'][func][var]
+            tl2 = pts2['functions'][func][var]
+
+            # Compare the consolidated numbers
+            # do_compare(func, var, tl1, tl2, 'max', threshold)
+            do_compare(func, var, tl1, tl2, 'min', threshold)
+            do_compare(func, var, tl1, tl2, 'mean', threshold)
+
+            # Skip over to the next variant or function if there is no detailed
+            # timing info for the function variant.
+            if 'timings' not in pts1['functions'][func][var].keys() or \
+                'timings' not in pts2['functions'][func][var].keys():
+                    return
+
+            # If two lists do not have the same length then it is likely that
+            # the performance characteristics of the function have changed.
+            # XXX: It is also likely that there was some measurement that
+            # strayed outside the usual range.  Such ouiers should not
+            # happen on an idle machine with identical hardware and
+            # configuration, but ideal environments are hard to come by.
+            if len(tl1['timings']) != len(tl2['timings']):
+                print('* %s(%s): Timing characteristics changed' %
+                        (func, var))
+                print('\tBefore: [%s]' %
+                        ', '.join([str(x) for x in tl1['timings']]))
+                print('\tAfter: [%s]' %
+                        ', '.join([str(x) for x in tl2['timings']]))
+                continue
+
+            # Collect numbers whose differences cross the threshold we have
+            # set.
+            issues = [(x, y) for x, y in zip(tl1['timings'], tl2['timings']) \
+                        if abs(y - x) * 100 / x > threshold]
+
+            # Now print them.
+            for t1, t2 in issues:
+                d = abs(t2 - t1) * 100 / t1
+                if t2 > t1:
+                    ind = '-'
+                else:
+                    ind = '+'
+
+                print("%s %s(%s): (%.2lf%%) from %g to %g" %
+                        (ind, func, var, d, t1, t2))
+
+
+def plot_graphs(bench1, bench2):
+    """Plot graphs for functions
+
+    Make scatter plots for the functions and their variants.
+
+    Args:
+        bench1: Set of points from the first machine
+        bench2: Set of points from the second machine.
+    """
+    for func in bench1['functions'].keys():
+        for var in bench1['functions'][func].keys():
+            # No point trying to print a graph if there are no detailed
+            # timings.
+            if u'timings' not in bench1['functions'][func][var].keys():
+                print('Skipping graph for %s(%s)' % (func, var))
+                continue
+
+            pylab.clf()
+            pylab.ylabel('Time (cycles)')
+
+            # First set of points
+            length = len(bench1['functions'][func][var]['timings'])
+            X = [float(x) for x in range(length)]
+            lines = pylab.scatter(X, bench1['functions'][func][var]['timings'],
+                    1.5 + 100 / length)
+            pylab.setp(lines, 'color', 'r')
+
+            # Second set of points
+            length = len(bench2['functions'][func][var]['timings'])
+            X = [float(x) for x in range(length)]
+            lines = pylab.scatter(X, bench2['functions'][func][var]['timings'],
+                    1.5 + 100 / length)
+            pylab.setp(lines, 'color', 'g')
+
+            if var:
+                filename = "%s-%s.png" % (func, var)
+            else:
+                filename = "%s.png" % func
+            print('Writing out %s' % filename)
+            pylab.savefig(filename)
+
+
+def main(args):
+    """Program Entry Point
+
+    Take two benchmark output files and compare their timings.
+    """
+    if len(args) > 4 or len(args) < 3:
+        print('Usage: %s <schema> <file1> <file2> [threshold in %%]' % sys.argv[0])
+        sys.exit(os.EX_USAGE)
+
+    bench1 = bench.parse_bench(args[1], args[0])
+    bench2 = bench.parse_bench(args[2], args[0])
+    if len(args) == 4:
+        threshold = float(args[3])
+    else:
+        threshold = 10.0
+
+    if (bench1['timing_type'] != bench2['timing_type']):
+        print('Cannot compare benchmark outputs: timing types are different')
+        return
+
+    plot_graphs(bench1, bench2)
+
+    bench.compress_timings(bench1)
+    bench.compress_timings(bench2)
+
+    compare_runs(bench1, bench2, threshold)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
index 81248c2..d37ff62 100644
--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -25,6 +25,102 @@ except ImportError:
     raise
 
 
+def mean(lst):
+    """Compute and return mean of numbers in a list
+
+    The numpy average function has horrible performance, so implement our
+    own mean function.
+
+    Args:
+        lst: The list of numbers to average.
+    Return:
+        The mean of members in the list.
+    """
+    return sum(lst) / len(lst)
+
+
+def split_list(bench, func, var):
+    """ Split the list into a smaller set of more distinct points
+
+    Group together points such that the difference between the smallest
+    point and the mean is less than 1/3rd of the mean.  This means that
+    the mean is at most 1.5x the smallest member of that group.
+
+    mean - xmin < mean / 3
+    i.e. 2 * mean / 3 < xmin
+    i.e. mean < 3 * xmin / 2
+
+    For an evenly distributed group, the largest member will be less than
+    twice the smallest member of the group.
+    Derivation:
+
+    An evenly distributed series would be xmin, xmin + d, xmin + 2d...
+
+    mean = (2 * n * xmin + n * (n - 1) * d) / 2 * n
+    and max element is xmin + (n - 1) * d
+
+    Now, mean < 3 * xmin / 2
+
+    3 * xmin > 2 * mean
+    3 * xmin > (2 * n * xmin + n * (n - 1) * d) / n
+    3 * n * xmin > 2 * n * xmin + n * (n - 1) * d
+    n * xmin > n * (n - 1) * d
+    xmin > (n - 1) * d
+    2 * xmin > xmin + (n-1) * d
+    2 * xmin > xmax
+
+    Hence, proved.
+
+    Similarly, it is trivial to prove that for a similar aggregation by using
+    the maximum element, the maximum element in the group must be at most 4/3
+    times the mean.
+
+    Args:
+        bench: The benchmark object
+        func: The function name
+        var: The function variant name
+    """
+    means = []
+    lst = bench['functions'][func][var]['timings']
+    last = len(lst) - 1
+    while lst:
+        for i in range(last + 1):
+            avg = mean(lst[i:])
+            if avg > 0.75 * lst[last]:
+                means.insert(0, avg)
+                lst = lst[:i]
+                last = i - 1
+                break
+    bench['functions'][func][var]['timings'] = means
+
+
+def do_for_all_timings(bench, callback):
+    """Call a function for all timing objects for each function and its
+    variants.
+
+    Args:
+        bench: The benchmark object
+        callback: The callback function
+    """
+    for func in bench['functions'].keys():
+        for k in bench['functions'][func].keys():
+            if 'timings' not in bench['functions'][func][k].keys():
+                continue
+
+            callback(bench, func, k)
+
+
+def compress_timings(points):
+    """Club points with close enough values into a single mean value
+
+    See split_list for details on how the clubbing is done.
+
+    Args:
+        points: The set of points.
+    """
+    do_for_all_timings(points, split_list)
+
+
 def parse_bench(filename, schema_filename):
     """Parse the input file