4 files changed, 239 insertions, 43 deletions
diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt
index 22dbdaa..18332d2 100644
--- a/llvm/unittests/CodeGen/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/CMakeLists.txt
@@ -37,6 +37,7 @@ add_llvm_unittest(CodeGenTests
   MachineInstrBundleIteratorTest.cpp
   MachineInstrTest.cpp
   MachineOperandTest.cpp
+  MIR2VecTest.cpp
   RegAllocScoreTest.cpp
   PassManagerTest.cpp
   ScalableVectorMVTsTest.cpp
diff --git a/llvm/unittests/CodeGen/MIR2VecTest.cpp b/llvm/unittests/CodeGen/MIR2VecTest.cpp
new file mode 100644
index 0000000..01f2ead
--- /dev/null
+++ b/llvm/unittests/CodeGen/MIR2VecTest.cpp
@@ -0,0 +1,195 @@
+//===- MIR2VecTest.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIR2Vec.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace mir2vec;
+using VocabMap = std::map<std::string, ir2vec::Embedding>;
+
+namespace {
+
+TEST(MIR2VecTest, RegexExtraction) {
+  // Plain names are kept; digits and operand-form suffixes are stripped
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("NOP"), "NOP");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("RET"), "RET");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("ADD16ri"), "ADD");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("ADD32rr"), "ADD");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("ADD64rm"), "ADD");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("MOV8ri"), "MOV");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("MOV32mr"), "MOV");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("PUSH64r"), "PUSH");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("POP64r"), "POP");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("JMP_4"), "JMP");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("CALL64pcrel32"), "CALL");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("SOME_INSTR_123"),
+            "SOME_INSTR");
+  EXPECT_EQ(MIRVocabulary::extractBaseOpcodeName("123ADD"), "ADD");
+  EXPECT_FALSE(MIRVocabulary::extractBaseOpcodeName("123").empty());
+}
+
+class MIR2VecVocabTestFixture : public ::testing::Test {
+protected:
+  std::unique_ptr<LLVMContext> Ctx;
+  std::unique_ptr<Module> M;
+  std::unique_ptr<TargetMachine> TM;
+  const TargetInstrInfo *TII = nullptr;
+
+  void SetUp() override {
+    LLVMInitializeX86TargetInfo();
+    LLVMInitializeX86Target();
+    LLVMInitializeX86TargetMC();
+
+    Ctx = std::make_unique<LLVMContext>();
+    M = std::make_unique<Module>("test", *Ctx);
+
+    // Set up X86 target
+    Triple TargetTriple("x86_64-unknown-linux-gnu");
+    M->setTargetTriple(TargetTriple);
+
+    std::string Error;
+    const Target *TheTarget =
+        TargetRegistry::lookupTarget(M->getTargetTriple(), Error);
+    ASSERT_TRUE(TheTarget) << "Failed to lookup target: " << Error;
+
+    TargetOptions Options;
+    TM = std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
+        M->getTargetTriple(), "", "", Options, Reloc::Model::Static));
+    ASSERT_TRUE(TM);
+
+    // Create a dummy function; getSubtargetImpl() needs one as its argument
+    FunctionType *FT = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    Function *F =
+        Function::Create(FT, Function::ExternalLinkage, "test", M.get());
+
+    // Get the target instruction info used by the tests to build vocabularies
+    TII = TM->getSubtargetImpl(*F)->getInstrInfo();
+    ASSERT_TRUE(TII);
+  }
+};
+
+TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
+  // Test that same base opcodes get same canonical indices
+  std::string BaseName1 = MIRVocabulary::extractBaseOpcodeName("ADD16ri");
+  std::string BaseName2 = MIRVocabulary::extractBaseOpcodeName("ADD32rr");
+  std::string BaseName3 = MIRVocabulary::extractBaseOpcodeName("ADD64rm");
+
+  EXPECT_EQ(BaseName1, BaseName2);
+  EXPECT_EQ(BaseName2, BaseName3);
+
+  // Create a MIRVocabulary instance to test the mapping
+  // Use a minimal MIRVocabulary to trigger canonical mapping construction
+  VocabMap VM;
+  Embedding Val = Embedding(64, 1.0f);
+  VM["ADD"] = Val;
+  MIRVocabulary TestVocab(std::move(VM), TII);
+
+  unsigned Index1 = TestVocab.getCanonicalIndexForBaseName(BaseName1);
+  unsigned Index2 = TestVocab.getCanonicalIndexForBaseName(BaseName2);
+  unsigned Index3 = TestVocab.getCanonicalIndexForBaseName(BaseName3);
+  EXPECT_EQ(Index1, Index2);
+  EXPECT_EQ(Index2, Index3);
+
+  // Test that different base opcodes get different canonical indices
+  std::string AddBase = MIRVocabulary::extractBaseOpcodeName("ADD32rr");
+  std::string SubBase = MIRVocabulary::extractBaseOpcodeName("SUB32rr");
+  std::string MovBase = MIRVocabulary::extractBaseOpcodeName("MOV32rr");
+
+  unsigned AddIndex = TestVocab.getCanonicalIndexForBaseName(AddBase);
+  unsigned SubIndex = TestVocab.getCanonicalIndexForBaseName(SubBase);
+  unsigned MovIndex = TestVocab.getCanonicalIndexForBaseName(MovBase);
+
+  EXPECT_NE(AddIndex, SubIndex);
+  EXPECT_NE(SubIndex, MovIndex);
+  EXPECT_NE(AddIndex, MovIndex);
+
+  // Even though we only added "ADD" to the vocab, the canonical mapping
+  // should assign unique indices to all the base opcodes of the target.
+  // Ideally, we would check against the exact number of unique base opcodes
+  // for X86, but that would make the test brittle. So we just check that
+  // the number is reasonably close to the expected number (>6880) and not
+  // just the opcodes that we added.
+  EXPECT_GT(TestVocab.getCanonicalSize(),
+            6880u); // X86 has >6880 unique base opcodes
+
+  // ADD keeps its embedding; opcodes not in the vocab get zero vectors
+  EXPECT_TRUE(TestVocab[AddIndex].approximatelyEquals(Val));
+  EXPECT_TRUE(TestVocab[SubIndex].approximatelyEquals(Embedding(64, 0.0f)));
+  EXPECT_TRUE(TestVocab[MovIndex].approximatelyEquals(Embedding(64, 0.0f)));
+}
+
+// Test that base-name lookups on one vocabulary instance are deterministic
+TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) {
+  // Test that the same base name always maps to the same canonical index
+  std::string BaseName = "ADD";
+
+  // Create a MIRVocabulary instance to test deterministic mapping
+  // Use a minimal MIRVocabulary to trigger canonical mapping construction
+  VocabMap VM;
+  VM["ADD"] = Embedding(64, 1.0f);
+  MIRVocabulary TestVocab(std::move(VM), TII);
+
+  unsigned Index1 = TestVocab.getCanonicalIndexForBaseName(BaseName);
+  unsigned Index2 = TestVocab.getCanonicalIndexForBaseName(BaseName);
+  unsigned Index3 = TestVocab.getCanonicalIndexForBaseName(BaseName);
+
+  EXPECT_EQ(Index1, Index2);
+  EXPECT_EQ(Index2, Index3);
+
+  // Repeated lookups on the same vocabulary must keep returning Index1
+  for (int Pos = 0; Pos < 100; ++Pos) {
+    unsigned Index = TestVocab.getCanonicalIndexForBaseName(BaseName);
+    EXPECT_EQ(Index, Index1);
+  }
+}
+
+// Test MIRVocabulary construction, validity, dimension and iteration
+TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
+  // A default-constructed MIRVocabulary must report itself as invalid
+  MIRVocabulary EmptyVocab;
+  EXPECT_FALSE(EmptyVocab.isValid());
+
+  // Test MIRVocabulary with embeddings via VocabMap
+  VocabMap VM;
+  VM["ADD"] = Embedding(128, 1.0f); // Dimension 128, all values 1.0
+  VM["SUB"] = Embedding(128, 2.0f); // Dimension 128, all values 2.0
+
+  MIRVocabulary Vocab(std::move(VM), TII);
+  EXPECT_TRUE(Vocab.isValid());
+  EXPECT_EQ(Vocab.getDimension(), 128u);
+
+  // Fatal assert: dereferencing IT below is only safe if it is not end()
+  auto IT = Vocab.begin();
+  ASSERT_NE(IT, Vocab.end());
+
+  // Check first embedding exists and has correct dimension
+  EXPECT_EQ((*IT).size(), 128u);
+
+  size_t Count = 0;
+  for (const auto &Emb : Vocab) {
+    EXPECT_EQ(Emb.size(), 128u);
+    ++Count;
+  }
+  EXPECT_GT(Count, 0u);
+}
+
+} // namespace
diff --git a/llvm/unittests/IR/FunctionTest.cpp b/llvm/unittests/IR/FunctionTest.cpp
index 7ba7584..8ed7699 100644
--- a/llvm/unittests/IR/FunctionTest.cpp
+++ b/llvm/unittests/IR/FunctionTest.cpp
@@ -625,4 +625,23 @@ TEST(FunctionTest, Personality) {
   EXPECT_FALSE(LLVMHasPersonalityFn(wrap(F)));
 }
 
+TEST(FunctionTest, LLVMGetOrInsertFunction) {
+  LLVMContext Ctx;
+  Module M("test", Ctx);
+  Type *Int8Ty = Type::getInt8Ty(Ctx);
+  FunctionType *FTy = FunctionType::get(Int8Ty, false);
+
+  // Create the function via the C API (third argument is the name length)
+  LLVMValueRef FuncRef = LLVMGetOrInsertFunction(wrap(&M), "F", 1, wrap(FTy));
+
+  // Verify that the returned function has the requested name and type
+  Function *Func = unwrap<Function>(FuncRef);
+  EXPECT_EQ(Func->getName(), "F");
+  EXPECT_EQ(Func->getFunctionType(), FTy);
+
+  // Call LLVMGetOrInsertFunction again to ensure it returns the same function
+  LLVMValueRef FuncRef2 = LLVMGetOrInsertFunction(wrap(&M), "F", 1, wrap(FTy));
+  EXPECT_EQ(FuncRef, FuncRef2);
+}
+
 } // end namespace
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 21d4596..fe9e7e8 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -606,82 +606,63 @@ TEST(InstructionTest, ConstrainedTrans) {
 
 TEST(InstructionsTest, isEliminableCastPair) {
   LLVMContext C;
+  DataLayout DL1("p1:32:32");
 
-  Type* Int16Ty = Type::getInt16Ty(C);
-  Type* Int32Ty = Type::getInt32Ty(C);
-  Type* Int64Ty = Type::getInt64Ty(C);
-  Type *Int64PtrTy = PointerType::get(C, 0);
+  Type *Int16Ty = Type::getInt16Ty(C);
+  Type *Int64Ty = Type::getInt64Ty(C);
+  Type *PtrTy64 = PointerType::get(C, 0);
+  Type *PtrTy32 = PointerType::get(C, 1);
 
   // Source and destination pointers have same size -> bitcast.
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
-                                           CastInst::IntToPtr,
-                                           Int64PtrTy, Int64Ty, Int64PtrTy,
-                                           Int32Ty, nullptr, Int32Ty),
-            CastInst::BitCast);
-
-  // Source and destination have unknown sizes, but the same address space and
-  // the intermediate int is the maximum pointer size -> bitcast
-  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
-                                           CastInst::IntToPtr,
-                                           Int64PtrTy, Int64Ty, Int64PtrTy,
-                                           nullptr, nullptr, nullptr),
+                                           CastInst::IntToPtr, PtrTy32, Int64Ty,
+                                           PtrTy32, &DL1),
             CastInst::BitCast);
 
-  // Source and destination have unknown sizes, but the same address space and
-  // the intermediate int is not the maximum pointer size -> nothing
+  // No DataLayout is provided, so pointer sizes are unknown -> fail.
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
-                                           CastInst::IntToPtr,
-                                           Int64PtrTy, Int32Ty, Int64PtrTy,
-                                           nullptr, nullptr, nullptr),
+                                           CastInst::IntToPtr, PtrTy32, Int64Ty,
+                                           PtrTy32, nullptr),
             0U);
 
   // Middle pointer big enough -> bitcast.
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
-                                           CastInst::PtrToInt,
-                                           Int64Ty, Int64PtrTy, Int64Ty,
-                                           nullptr, Int64Ty, nullptr),
+                                           CastInst::PtrToInt, Int64Ty, PtrTy64,
+                                           Int64Ty, &DL1),
             CastInst::BitCast);
 
   // Middle pointer too small -> fail.
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
-                                           CastInst::PtrToInt,
-                                           Int64Ty, Int64PtrTy, Int64Ty,
-                                           nullptr, Int32Ty, nullptr),
+                                           CastInst::PtrToInt, Int64Ty, PtrTy32,
+                                           Int64Ty, &DL1),
             0U);
 
   // Test that we don't eliminate bitcasts between different address spaces,
   // or if we don't have available pointer size information.
-  DataLayout DL("e-p:32:32:32-p1:16:16:16-p2:64:64:64-i1:8:8-i8:8:8-i16:16:16"
-                "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64"
-                "-v128:128:128-a:0:64-s:64:64-f80:128:128-n8:16:32:64-S128");
+  DataLayout DL2("e-p:32:32:32-p1:16:16:16-p2:64:64:64-i1:8:8-i8:8:8-i16:16:16"
+                 "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64"
+                 "-v128:128:128-a:0:64-s:64:64-f80:128:128-n8:16:32:64-S128");
 
   Type *Int64PtrTyAS1 = PointerType::get(C, 1);
   Type *Int64PtrTyAS2 = PointerType::get(C, 2);
 
-  IntegerType *Int16SizePtr = DL.getIntPtrType(C, 1);
-  IntegerType *Int64SizePtr = DL.getIntPtrType(C, 2);
-
   // Cannot simplify inttoptr, addrspacecast
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
-                                           CastInst::AddrSpaceCast,
-                                           Int16Ty, Int64PtrTyAS1, Int64PtrTyAS2,
-                                           nullptr, Int16SizePtr, Int64SizePtr),
+                                           CastInst::AddrSpaceCast, Int16Ty,
+                                           Int64PtrTyAS1, Int64PtrTyAS2, &DL2),
             0U);
 
   // Cannot simplify addrspacecast, ptrtoint
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::AddrSpaceCast,
-                                           CastInst::PtrToInt,
-                                           Int64PtrTyAS1, Int64PtrTyAS2, Int16Ty,
-                                           Int64SizePtr, Int16SizePtr, nullptr),
+                                           CastInst::PtrToInt, Int64PtrTyAS1,
+                                           Int64PtrTyAS2, Int16Ty, &DL2),
             0U);
 
   // Pass since the bitcast address spaces are the same
-  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
-                                           CastInst::BitCast,
-                                           Int16Ty, Int64PtrTyAS1, Int64PtrTyAS1,
-                                           nullptr, nullptr, nullptr),
+  EXPECT_EQ(CastInst::isEliminableCastPair(
+                CastInst::IntToPtr, CastInst::BitCast, Int16Ty, Int64PtrTyAS1,
+                Int64PtrTyAS1, nullptr),
             CastInst::IntToPtr);
-
 }
 
 TEST(InstructionsTest, CloneCall) {