[MLGO] Add ability to extract IR from bazel using aquery (#96964)

This patch adds support for extracting IR from binaries built with bazel by querying the linker command line with bazel aquery.
author: Aiden Grossman <aidengrossman@google.com> 2024-06-27 14:28:02 -0700
committer: GitHub <noreply@github.com> 2024-06-27 14:28:02 -0700
commit: 4134b33c6a362cb462b335177d6d9e8235f04309 (patch)
tree: e1e16294212d78641859852e673aa5c49f88a0b8
parent: 6b55ec1198ce9356340372fd8233b907d8d4cea2 (diff)
download: llvm-4134b33c6a362cb462b335177d6d9e8235f04309.zip
llvm-4134b33c6a362cb462b335177d6d9e8235f04309.tar.gz
llvm-4134b33c6a362cb462b335177d6d9e8235f04309.tar.bz2
3 files changed, 67 insertions, 2 deletions
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 9441543..a7d52da 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -45,8 +45,8 @@ def parse_args_and_run():
     parser.add_argument(
         "--input_type",
         type=str,
-        help="Input file type - JSON, LLD params, or directory.",
-        choices=["json", "params", "directory"],
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "bazel_aquery"],
         default="json",
         nargs="?",
     )
@@ -149,6 +149,11 @@ def main(args):
             "structured compilation database, use that instead"
         )
         objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "bazel_aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
     else:
         logging.error("Unknown input type: %s", args.input_type)
 
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
index 8e9779c..f434e59 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
@@ -316,6 +316,29 @@ def load_for_lld_thinlto(
     return [make_spec(path) for path in paths]
 
 
+def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
+    """Creates an object file array by looking at the JSON output of bazel aquery.
+
+    Args:
+      aquery_json: The JSON-formatted output of the bazel aquery command for
+        the target of interest. The bazel aquery JSON should be a JSON
+        serialized version of the analysis.ActionGraphContainer proto.
+        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
+      obj_base_dir: The base build directory that all object files will be
+        written out relative to.
+      output_dir: The output directory where extracted .bc and .cmd files should
+        be placed.
+    """
+    linker_params = []
+
+    for action_info in aquery_json["actions"]:
+        if action_info["mnemonic"] != "CppLink":
+            continue
+        linker_params = action_info["arguments"]
+
+    return load_from_lld_params(linker_params, obj_base_dir, output_dir)
+
+
 def run_extraction(
     objs: List[TrainingIRExtractor],
     num_workers: int,
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
index 653c572..f056446 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
@@ -304,6 +304,43 @@ def test_lld_thinlto_extraction(outer, outdir):
     # CHECK-LLD-THINLTO-EXTRACTION-PY: 3
 
 
+## Test that we can load a bazel aquery JSON as expected.
+
+# RUN: %python %s test_load_bazel_aquery | FileCheck %s --check-prefix CHECK-TEST-LOAD-BAZEL-AQUERY
+
+
+def test_load_bazel_aquery():
+    obj = extract_ir_lib.load_bazel_aquery(
+        {
+            "actions": [
+                {"mnemonic": "not-link", "arguments": []},
+                {
+                    "mnemonic": "CppLink",
+                    "arguments": ["clang", "-o", "output_binary", "test1.o", "test2.o"],
+                },
+            ]
+        },
+        "/some/path",
+        "/tmp/out",
+    )
+    print(obj[0].input_obj())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test1.o
+    print(obj[0].relative_output_path())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: test1.o
+    print(obj[0].cmd_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.cmd
+    print(obj[0].bc_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.bc
+    print(obj[1].input_obj())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test2.o
+    print(obj[1].relative_output_path())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: test2.o
+    print(obj[1].cmd_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.cmd
+    print(obj[1].bc_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.bc
+
+
 ## Test that filtering works correctly
 
 # RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING
author	Aiden Grossman <aidengrossman@google.com>	2024-06-27 14:28:02 -0700
committer	GitHub <noreply@github.com>	2024-06-27 14:28:02 -0700
commit	4134b33c6a362cb462b335177d6d9e8235f04309 (patch)
tree	e1e16294212d78641859852e673aa5c49f88a0b8
parent	6b55ec1198ce9356340372fd8233b907d8d4cea2 (diff)
download	llvm-4134b33c6a362cb462b335177d6d9e8235f04309.zip llvm-4134b33c6a362cb462b335177d6d9e8235f04309.tar.gz llvm-4134b33c6a362cb462b335177d6d9e8235f04309.tar.bz2