#!/usr/bin/env python3

import argparse
import functools
import os
import pathlib
import re
import statistics
import subprocess
import sys
import tempfile

import plotly
import tqdm


@functools.total_ordering
class Commit:
    """
    This class represents a commit inside a given Git repository.
    """
    def __init__(self, git_repo, sha):
        self._git_repo = git_repo
        self._sha = sha

    def __eq__(self, other):
        """
        Return whether two commits refer to the same commit.

        This doesn't take into account the content of the Git tree at those commits, only
        the 'identity' of the commits themselves.
        """
        return self.fullrev == other.fullrev

    def __lt__(self, other):
        """
        Return whether a commit is an ancestor of another commit in the Git repository.
        """
        # Is self._sha an ancestor of other._sha?
        # `git merge-base --is-ancestor` exits with 0 if it is, with 1 if it isn't, and with
        # any other code if something went wrong.
        res = subprocess.run(['git', '-C', self._git_repo, 'merge-base', '--is-ancestor', self._sha, other._sha])
        if res.returncode not in (0, 1):
            raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')
        return res.returncode == 0

    def show(self, include_diff=False):
        """
        Return the commit information equivalent to `git show` associated to this commit.
        """
        cmd = ['git', '-C', self._git_repo, 'show', self._sha]
        if not include_diff:
            cmd.append('--no-patch')
        return subprocess.check_output(cmd, text=True)

    @functools.cached_property
    def shortrev(self):
        """
        Return the shortened version of the given SHA.
        """
        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', '--short', self._sha], text=True).strip()

    @functools.cached_property
    def fullrev(self):
        """
        Return the full SHA associated to this commit.
        """
        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', self._sha], text=True).strip()

    def prefetch(self):
        """
        Prefetch cached properties associated to this commit object.

        This makes it possible to control when time is spent recovering that information from Git
        for e.g. better reporting to the user.
        """
        self.shortrev
        self.fullrev

    def __str__(self):
        return self._sha


def truncate_lines(string, n, marker=None):
    """
    Truncate the given string at a certain number of lines.

    Optionally, add a marker on the last line to identify that truncation has happened.
    """
    lines = string.splitlines()
    truncated = lines[:n]
    if marker is not None and len(lines) > len(truncated):
        truncated[-1] = marker
    assert len(truncated) <= n, "broken post-condition"
    return '\n'.join(truncated)


def create_plot(commits, benchmarks, data):
    """
    Create a plot object showing the evolution of each benchmark throughout the given commits.
    """
    figure = plotly.graph_objects.Figure(layout_title_text=f'{commits[0].shortrev} to {commits[-1].shortrev}')

    # Create the X axis and the hover information. Plotly renders hover text as HTML,
    # so newlines in the commit message are replaced by <br> tags.
    x_axis = [commit.shortrev for commit in commits]
    hover_info = [truncate_lines(commit.show(), 30, marker='...').replace('\n', '<br>') for commit in commits]
    # For each benchmark, get the metric for that benchmark for each commit.
    #
    # Some commits may not have any data associated to a benchmark (e.g. runtime or compilation error).
    # Use None, which is handled properly by plotly.
    for benchmark in benchmarks:
        series = [commit_data.get(benchmark, None) for commit_data in data]
        scatter = plotly.graph_objects.Scatter(x=x_axis, y=series, text=hover_info, name=benchmark)
        figure.add_trace(scatter)

    return figure


def directory_path(string):
    if os.path.isdir(string):
        return pathlib.Path(string)
    else:
        raise NotADirectoryError(string)


def parse_lnt(lines):
    """
    Parse lines in LNT format and return a dictionary of the form:

        {
            'benchmark1': {
                'metric1': [float],
                'metric2': [float],
                ...
            },
            'benchmark2': {
                'metric1': [float],
                'metric2': [float],
                ...
            },
            ...
        }

    Each metric may have multiple values.
    """
    results = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue

        (identifier, value) = line.split(' ')
        (name, metric) = identifier.split('.')
        if name not in results:
            results[name] = {}
        if metric not in results[name]:
            results[name][metric] = []
        results[name][metric].append(float(value))
    return results


def main(argv):
    parser = argparse.ArgumentParser(
        prog='visualize-historical',
        description='Visualize historical data in LNT format. This program generates an HTML file that embeds an '
                    'interactive plot with the provided data. The HTML file can then be opened in a browser to '
                    'visualize the data as a chart.',
        epilog='This script depends on the `plotly` and the `tqdm` Python modules.')
    parser.add_argument('directory', type=directory_path,
                        help='Path to a valid directory containing benchmark data in LNT format, each file being named '
                             '<commit>.lnt. This is also the format generated by the `benchmark-historical` utility.')
    parser.add_argument('--output', '-o', type=pathlib.Path, required=False,
                        help='Optional path where to output the resulting HTML file. If it already exists, it is overwritten. '
                             'Defaults to a temporary file which is opened automatically once generated, but not removed after '
                             'creation.')
    parser.add_argument('--metric', type=str, default='execution_time',
                        help='The metric to compare. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- '
                             'this option allows selecting which metric is being visualized. The default is "execution_time".')
    parser.add_argument('--filter', type=str, required=False,
                        help='An optional regular expression used to filter the benchmarks included in the chart. '
                             'Only benchmarks whose names match the regular expression will be included. '
                             'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
                             'and to then filter them in the browser, but in some cases producing a chart with a reduced '
                             'number of data series is useful.')
    parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
                        help='Path to the git repository to use for ordering commits in time. '
                             'By default, the current working directory is used.')
    parser.add_argument('--open', action='store_true',
                        help='Whether to automatically open the generated HTML file when finished. If no output file is '
                             'provided, the resulting chart is opened automatically by default.')
    args = parser.parse_args(argv)

    # Extract benchmark data from the directory and keep only the metric we're interested in.
    #
    # Some data points may have multiple values associated to the metric (e.g. if we performed
    # multiple runs to reduce noise), in which case we aggregate them using a median.
    historical_data = []
    files = [f for f in args.directory.glob('*.lnt')]
    for file in tqdm.tqdm(files, desc='Parsing LNT files'):
        (commit, _) = os.path.splitext(os.path.basename(file))
        commit = Commit(args.git_repo, commit)
        with open(file, 'r') as f:
            lnt_data = parse_lnt(f.readlines())
        commit_data = {}
        for (bm, metrics) in lnt_data.items():
            commit_data[bm] = statistics.median(metrics[args.metric]) if args.metric in metrics else None
        historical_data.append((commit, commit_data))

    # Obtain commit information which is then cached throughout the program. Do this
    # eagerly so we can provide a progress bar.
    for (commit, _) in tqdm.tqdm(historical_data, desc='Prefetching Git information'):
        commit.prefetch()

    # Sort the data based on the ordering of commits inside the provided Git repository
    historical_data.sort(key=lambda x: x[0])

    # Filter the benchmarks if needed
    benchmarks = {b for (_, commit_data) in historical_data for b in commit_data.keys()}
    if args.filter is not None:
        regex = re.compile(args.filter)
        benchmarks = {b for b in benchmarks if regex.search(b)}

    # Plot the data for all the required benchmarks
    figure = create_plot([commit for (commit, _) in historical_data],
                         sorted(list(benchmarks)),
                         [data for (_, data) in historical_data])
    do_open = args.output is None or args.open
    output = args.output if args.output is not None else tempfile.NamedTemporaryFile(suffix='.html').name
    plotly.io.write_html(figure, file=output, auto_open=do_open)


if __name__ == '__main__':
    main(sys.argv[1:])