#!/usr/bin/env python3

import argparse
import functools
import os
import pathlib
import re
import statistics
import subprocess
import sys
import tempfile

import plotly
import tqdm


@functools.total_ordering
class Commit:
    """
    This class represents a commit inside a given Git repository.
    """
    def __init__(self, git_repo, sha):
        self._git_repo = git_repo
        self._sha = sha

    def __eq__(self, other):
        """
        Return whether two commits refer to the same commit.

        This doesn't take into account the content of the Git tree at those commits, only
        the 'identity' of the commits themselves.
        """
        return self.fullrev == other.fullrev

    def __lt__(self, other):
        """
        Return whether a commit is an ancestor of another commit in the Git repository.
        """
        # Is self._sha an ancestor of other._sha?
        # `git merge-base --is-ancestor` exits with 0 if it is, with 1 if it isn't, and with
        # any other code if something went wrong.
        res = subprocess.run(['git', '-C', self._git_repo, 'merge-base', '--is-ancestor', self._sha, other._sha])
        if res.returncode not in (0, 1):
            raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')
        return res.returncode == 0

    def show(self, include_diff=False):
        """
        Return the commit information equivalent to `git show` associated to this commit.
        """
        cmd = ['git', '-C', self._git_repo, 'show', self._sha]
        if not include_diff:
            cmd.append('--no-patch')
        return subprocess.check_output(cmd, text=True)

    @functools.cached_property
    def shortrev(self):
        """
        Return the shortened version of the given SHA.
        """
        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', '--short', self._sha], text=True).strip()

    @functools.cached_property
    def fullrev(self):
        """
        Return the full SHA associated to this commit.
        """
        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', self._sha], text=True).strip()

    def prefetch(self):
        """
        Prefetch cached properties associated to this commit object.

        This makes it possible to control when time is spent recovering that information from Git
        for e.g. better reporting to the user.
        """
        self.shortrev
        self.fullrev

    def __str__(self):
        return self._sha


def truncate_lines(string, n, marker=None):
    """
    Truncate the given string at a certain number of lines.

    Optionally, add a marker on the last line to identify that truncation has happened.
    """
    lines = string.splitlines()
    truncated = lines[:n]
    if marker is not None and len(lines) > len(truncated):
        truncated[-1] = marker
    assert len(truncated) <= n, "broken post-condition"
    return '\n'.join(truncated)


def create_plot(commits, benchmarks, data):
    """
    Create a plot object showing the evolution of each benchmark throughout the given commits.
    """
    figure = plotly.graph_objects.Figure(layout_title_text=f'{commits[0].shortrev} to {commits[-1].shortrev}')

    # Create the X axis and the hover information. Plotly renders hover text as HTML,
    # so newlines in the commit message are replaced by <br> tags.
    x_axis = [commit.shortrev for commit in commits]
    hover_info = [truncate_lines(commit.show(), 30, marker='...').replace('\n', '<br>') for commit in commits]
    # For each benchmark, get the metric for that benchmark for each commit.
    #
    # Some commits may not have any data associated to a benchmark (e.g. runtime or compilation error).
    # Use None, which is handled properly by plotly.
    for benchmark in benchmarks:
        series = [commit_data.get(benchmark, None) for commit_data in data]
        scatter = plotly.graph_objects.Scatter(x=x_axis, y=series, text=hover_info, name=benchmark)
        figure.add_trace(scatter)

    return figure


def directory_path(string):
    if os.path.isdir(string):
        return pathlib.Path(string)
    else:
        raise NotADirectoryError(string)


def parse_lnt(lines):
    """
    Parse lines in LNT format and return a dictionary of the form:

        {
            'benchmark1': {
                'metric1': [float],
                'metric2': [float],
                ...
            },
            'benchmark2': {
                'metric1': [float],
                'metric2': [float],
                ...
            },
            ...
        }

    Each metric may have multiple values.
    """
    results = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue

        (identifier, value) = line.split(' ')
        (name, metric) = identifier.split('.')
        if name not in results:
            results[name] = {}
        if metric not in results[name]:
            results[name][metric] = []
        results[name][metric].append(float(value))
    return results


def main(argv):
    parser = argparse.ArgumentParser(
        prog='visualize-historical',
        description='Visualize historical data in LNT format. This program generates an HTML file that embeds an '
                    'interactive plot with the provided data. The HTML file can then be opened in a browser to '
                    'visualize the data as a chart.',
        epilog='This script depends on the `plotly` and the `tqdm` Python modules.')
    parser.add_argument('directory', type=directory_path,
                        help='Path to a valid directory containing benchmark data in LNT format, each file being named '
                             '<commit>.lnt. This is also the format generated by the `benchmark-historical` utility.')
    parser.add_argument('--output', '-o', type=pathlib.Path, required=False,
                        help='Optional path where to output the resulting HTML file. If it already exists, it is overwritten. '
                             'Defaults to a temporary file which is opened automatically once generated, but not removed after '
                             'creation.')
    parser.add_argument('--metric', type=str, default='execution_time',
                        help='The metric to compare. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- '
                             'this option allows selecting which metric is being visualized. The default is "execution_time".')
    parser.add_argument('--filter', type=str, required=False,
                        help='An optional regular expression used to filter the benchmarks included in the chart. '
                             'Only benchmarks whose names match the regular expression will be included. '
                             'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
                             'and to then filter them in the browser, but in some cases producing a chart with a reduced '
                             'number of data series is useful.')
    parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
                        help='Path to the git repository to use for ordering commits in time. '
                             'By default, the current working directory is used.')
    parser.add_argument('--open', action='store_true',
                        help='Whether to automatically open the generated HTML file when finished. If no output file is '
                             'provided, the resulting chart is opened automatically by default.')
    args = parser.parse_args(argv)

    # Extract benchmark data from the directory and keep only the metric we're interested in.
    #
    # Some data points may have multiple values associated to the metric (e.g. if we performed
    # multiple runs to reduce noise), in which case we aggregate them using a median.
    historical_data = []
    files = [f for f in args.directory.glob('*.lnt')]
    for file in tqdm.tqdm(files, desc='Parsing LNT files'):
        (commit, _) = os.path.splitext(os.path.basename(file))
        commit = Commit(args.git_repo, commit)
        with open(file, 'r') as f:
            lnt_data = parse_lnt(f.readlines())
        commit_data = {}
        for (bm, metrics) in lnt_data.items():
            commit_data[bm] = statistics.median(metrics[args.metric]) if args.metric in metrics else None
        historical_data.append((commit, commit_data))

    # Obtain commit information which is then cached throughout the program. Do this
    # eagerly so we can provide a progress bar.
    for (commit, _) in tqdm.tqdm(historical_data, desc='Prefetching Git information'):
        commit.prefetch()

    # Sort the data based on the ordering of commits inside the provided Git repository
    historical_data.sort(key=lambda x: x[0])

    # Filter the benchmarks if needed
    benchmarks = {b for (_, commit_data) in historical_data for b in commit_data.keys()}
    if args.filter is not None:
        regex = re.compile(args.filter)
        benchmarks = {b for b in benchmarks if regex.search(b)}

    # Plot the data for all the required benchmarks
    figure = create_plot([commit for (commit, _) in historical_data],
                         sorted(list(benchmarks)),
                         [data for (_, data) in historical_data])
    do_open = args.output is None or args.open
    output = args.output if args.output is not None else tempfile.NamedTemporaryFile(suffix='.html').name
    plotly.io.write_html(figure, file=output, auto_open=do_open)


if __name__ == '__main__':
    main(sys.argv[1:])