diff options
author | Nicolai Hähnle <nicolai.haehnle@amd.com> | 2024-10-04 14:51:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-04 14:51:12 +0200 |
commit | 02debcef12793b5bf926a41a24ede18dc50eb18b (patch) | |
tree | 2042b0b7f6839605b11bbf2598db83ea9cf3b773 /llvm/utils/UpdateTestChecks/common.py | |
parent | 04540fac5bfa6c1630e84ccdc7f817bd8bc1a986 (diff) | |
download | llvm-02debcef12793b5bf926a41a24ede18dc50eb18b.zip llvm-02debcef12793b5bf926a41a24ede18dc50eb18b.tar.gz llvm-02debcef12793b5bf926a41a24ede18dc50eb18b.tar.bz2 |
update_test_checks: improve IR value name stability (#110940)
By default, UTC attempts to keep the produced diff small by keeping IR
value name variables stable. The old algorithm was roughly:
1. Compute a diff between the old and new check lines, where
"uncommitted" variable names are replaced by a wildcard.
This leads to a set of non-crossing "candidate" pairs of
(old line, new line) that we can try to make equal.
2. Greedily walk this list of candidates, committing to variable names
that make candidate lines equal if possible.
The greedy approach in the second step has the downside that committing
to a variable name greedily can sometimes prevent many subsequent
candidates from getting the variable name assignment that would make
them equal.
We keep the first step as-is, but replace the second one by an algorithm
that finds a large independent set of candidates, i.e. candidate pairs
of (old line, new line) which are non-conflicting in the sense that
their desired variable name mappings are not in conflict.
We find the large independent set by greedily assigning a coloring to
the conflict graph and taking the largest color class. We then commit to
all the variable name mappings which are desired by candidates in this
largest color class.
As before, we then recurse into regions between matching lines. This is
required in large cases. For example, running this algorithm at the
top-level of the new test case (stable_ir_values5.ll) matches up most of
the instructions, but not the names of the result values of all the
`load`s. This is because (unlike e.g. the getelementptrs) the load
instructions are all equal except for variable names, and so step 1 (the
diff algorithm) doesn't consider them as candidates. However, they are
trivially matched by recursion.
This also happens to fix a bug in tracking line indices that went
unnoticed previously...
As is usually the case with these changes, the quality improvement is
hard to see from the diff of this patch. However, it becomes obvious
when
comparing the diff of stable_ir_values5.ll against
stable_ir_value5.ll.expected
before and after this change.
Diffstat (limited to 'llvm/utils/UpdateTestChecks/common.py')
-rw-r--r-- | llvm/utils/UpdateTestChecks/common.py | 130 |
1 files changed, 103 insertions, 27 deletions
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index b861bd0..759fc5d 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -1534,14 +1534,58 @@ def remap_metavar_names( candidate_matches = find_diff_matching(lhs_lines, rhs_lines) - # Apply commits greedily on a match-by-match basis - matches = [(-1, -1)] - committed_anything = False + candidate_matches = [ + (old_begin + lhs_idx, new_begin + rhs_idx) + for lhs_idx, rhs_idx in candidate_matches + ] + + # Candidate matches may conflict if they require conflicting mappings of + # names. We want to determine a large set of compatible candidates, + # because that leads to a small diff. + # + # We think of the candidates as vertices in a conflict graph. The + # conflict graph has edges between incompatible candidates. We want to + # find a large independent set in this graph. + # + # Greedily selecting candidates and removing incompatible ones has the + # disadvantage that making few bad decisions early on can have huge + # consequences. + # + # Instead, we implicitly compute multiple independent sets by greedily + # assigning a *coloring* to the conflict graph. Then, we select the + # largest color class (which is the largest independent set we found), + # commit to all candidates in it, and recurse. + # + # Note that we don't actually materialize the conflict graph. Instead, + # each color class tracks the information needed to decide implicitly + # whether a vertex conflicts (has an edge to) any of the vertices added + # to the color class so far. + class Color: + def __init__(self): + # (lhs_idx, rhs_idx) of matches in this color + self.matches = [] + + # rhs_name -> lhs_name mappings required by this color + self.mapping = {} + + # lhs_names committed for this color + self.committed = set() + + colors = [] + for lhs_idx, rhs_idx in candidate_matches: lhs_line = old_line_infos[lhs_idx] rhs_line = new_line_infos[rhs_idx] - local_commits = {} + # We scan through the uncommitted names in the candidate line and + # filter out the color classes to which the candidate could be + # assigned. + # + # Simultaneously, we prepare a new color class in case the candidate + # conflicts with all colors that have been established so far. + compatible_colors = colors[:] + new_color = Color() + new_color.matches.append((lhs_idx, rhs_idx)) for lhs_value, rhs_value in zip(lhs_line.values, rhs_line.values): if new_mapping[rhs_value.name] in committed_names: @@ -1554,39 +1598,71 @@ def remap_metavar_names( else: break - if rhs_value.name in local_commits: + if rhs_value.name in new_color.mapping: # Same, but for a possible commit happening on the same line - if local_commits[rhs_value.name] == lhs_value.name: + if new_color.color[rhs_value.name] == lhs_value.name: continue else: break - if lhs_value.name in committed_names: - # We can't map this value because the name we would map it to has already been - # committed for something else. Give up on this line. + if ( + lhs_value.name in committed_names + or lhs_value.name in new_color.committed + ): + # We can't map this value because the name we would map it + # to has already been committed for something else. Give up + # on this line. break - local_commits[rhs_value.name] = lhs_value.name - else: - # No reason not to add any commitments for this line - for rhs_var, lhs_var in local_commits.items(): - new_mapping[rhs_var] = lhs_var - committed_names.add(lhs_var) - committed_anything = True - - if ( - lhs_var != rhs_var - and lhs_var in new_mapping - and new_mapping[lhs_var] == lhs_var - ): - new_mapping[lhs_var] = "conflict_" + lhs_var + new_color.mapping[rhs_value.name] = lhs_value.name + new_color.committed.add(lhs_value.name) - matches.append((lhs_idx, rhs_idx)) + color_idx = 0 + while color_idx < len(compatible_colors): + color = compatible_colors[color_idx] + compatible = True + if rhs_value.name in color.mapping: + compatible = color.mapping[rhs_value.name] == lhs_value.name + else: + compatible = lhs_value.name not in color.committed + if compatible: + color_idx += 1 + else: + del compatible_colors[color_idx] + else: + # We never broke out of the loop, which means that at a minimum, + # this line is viable standalone + if compatible_colors: + color = max(compatible_colors, key=lambda color: len(color.matches)) + color.mapping.update(new_color.mapping) + color.committed.update(new_color.committed) + color.matches.append((lhs_idx, rhs_idx)) + else: + colors.append(new_color) + + if colors: + # Pick the largest color class. This gives us a large independent + # (non-conflicting) set of candidate matches. Assign all names + # required by the independent set and recurse. + max_color = max(colors, key=lambda color: len(color.matches)) + + for rhs_var, lhs_var in max_color.mapping.items(): + new_mapping[rhs_var] = lhs_var + committed_names.add(lhs_var) + + if ( + lhs_var != rhs_var + and lhs_var in new_mapping + and new_mapping[lhs_var] == lhs_var + ): + new_mapping[lhs_var] = "conflict_" + lhs_var - matches.append((old_end, new_end)) + matches = ( + [(old_begin - 1, new_begin - 1)] + + max_color.matches + + [(old_end, new_end)] + ) - # Recursively handle sequences between matches - if committed_anything: for (lhs_prev, rhs_prev), (lhs_next, rhs_next) in zip(matches, matches[1:]): recurse(lhs_prev + 1, lhs_next, rhs_prev + 1, rhs_next) |