[TableGen] Optimize intrinsic info type signature encoding (#106809)

Change the "fixed encoding" table used for encoding intrinsic type signatures to use a 16-bit encoding instead of a 32-bit one. This results in both space and time improvements. For space, the total static storage size (in bytes) of this info shrinks by about a third: - Current size = 14193*4 (Fixed table) + 16058 + 3 (Long Table) = 72833. - New size = 14193*2 (Fixed table) + 19879 + 3 (Long Table) = 48268. - Reduction = 33.7% (equivalently, the old tables were 50.9% larger than the new ones). For time, with the added benchmark, we see a 7.3% speedup in the `GetIntrinsicInfoTableEntries` benchmark. The actual output of the benchmark is included in the GitHub pull request.
author: Rahul Joshi <rjoshi@nvidia.com> 2024-09-04 14:46:48 -0700
committer: GitHub <noreply@github.com> 2024-09-04 14:46:48 -0700
commit: dcf0160bd61d150e7b94067fcd991b466a361b08 (patch)
tree: c4f1706420e268d68157e75f6a362a34928f97f3
parent: dd754cd262222bcb489038ac791e4278d90697f0 (diff)
download: llvm-dcf0160bd61d150e7b94067fcd991b466a361b08.zip
llvm-dcf0160bd61d150e7b94067fcd991b466a361b08.tar.gz
llvm-dcf0160bd61d150e7b94067fcd991b466a361b08.tar.bz2
4 files changed, 87 insertions, 38 deletions
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index e3366e6..aa0cb77 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -6,3 +6,4 @@ add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
 add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
 add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIAL_SOURCES_INTENDED)
 add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED)
diff --git a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
new file mode 100644
index 0000000..7f3bd3b
--- /dev/null
+++ b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
@@ -0,0 +1,30 @@
+//===- GetIntrinsicInfoTableEntriesBM.cpp - IIT signature benchmark -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+using namespace Intrinsic;
+
+static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) {
+  SmallVector<IITDescriptor> Table;
+  for (auto _ : state) {
+    for (ID ID = 1; ID < num_intrinsics; ++ID) {
+      // Clearing the vector keeps it from growing across calls and, after the
+      // first iteration, avoids any additional allocations.
+      Table.clear();
+      getIntrinsicInfoTableEntries(ID, Table);
+    }
+  }
+}
+
+BENCHMARK(BM_GetIntrinsicInfoTableEntries);
+
+BENCHMARK_MAIN();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 69520fd..afef893 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1381,22 +1381,24 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
 
 void Intrinsic::getIntrinsicInfoTableEntries(ID id,
                                              SmallVectorImpl<IITDescriptor> &T){
+  static_assert(sizeof(IIT_Table[0]) == 2,
+                "Expect 16-bit entries in IIT_Table");
   // Check to see if the intrinsic's type was expressible by the table.
-  unsigned TableVal = IIT_Table[id-1];
+  uint16_t TableVal = IIT_Table[id - 1];
 
   // Decode the TableVal into an array of IITValues.
-  SmallVector<unsigned char, 8> IITValues;
+  SmallVector<unsigned char> IITValues;
   ArrayRef<unsigned char> IITEntries;
   unsigned NextElt = 0;
-  if ((TableVal >> 31) != 0) {
+  if (TableVal >> 15) {
     // This is an offset into the IIT_LongEncodingTable.
     IITEntries = IIT_LongEncodingTable;
 
     // Strip sentinel bit.
-    NextElt = (TableVal << 1) >> 1;
+    NextElt = TableVal & 0x7fff;
   } else {
-    // Decode the TableVal into an array of IITValues.  If the entry was encoded
-    // into a single word in the table itself, decode it now.
+    // If the entry was encoded into a single word in the table itself, decode
+    // it from an array of nibbles to an array of bytes.
     do {
       IITValues.push_back(TableVal & 0xF);
       TableVal >>= 4;
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 09eb1ed..0f4d7bf 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -282,11 +282,37 @@ static TypeSigTy ComputeTypeSignature(const CodeGenIntrinsic &Int) {
   return TypeSig;
 }
 
+// Pack the type signature into a 32-bit fixed encoding word, if possible.
+static std::optional<uint32_t> encodePacked(const TypeSigTy &TypeSig) {
+  if (TypeSig.size() > 8)
+    return std::nullopt;
+
+  uint32_t Result = 0;
+  for (unsigned char C : reverse(TypeSig)) {
+    if (C > 15)
+      return std::nullopt;
+    Result = (Result << 4) | C;
+  }
+  return Result;
+}
+
 void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
                                      raw_ostream &OS) {
-  // If we can compute a 32-bit fixed encoding for this intrinsic, do so and
+  // Note: the code below can be switched to use 32-bit fixed encoding by
+  // flipping the flag below.
+  constexpr bool Use16BitFixedEncoding = true;
+  using FixedEncodingTy =
+      std::conditional_t<Use16BitFixedEncoding, uint16_t, uint32_t>;
+  constexpr unsigned FixedEncodingBits = sizeof(FixedEncodingTy) * CHAR_BIT;
+  // Mask with all bits 1 except the most significant bit.
+  const unsigned Mask = (1U << (FixedEncodingBits - 1)) - 1;
+  const unsigned MSBPostion = FixedEncodingBits - 1;
+  StringRef FixedEncodingTypeName =
+      Use16BitFixedEncoding ? "uint16_t" : "uint32_t";
+
+  // If we can compute a 16/32-bit fixed encoding for this intrinsic, do so and
   // capture it in this vector, otherwise store a ~0U.
-  std::vector<unsigned> FixedEncodings;
+  std::vector<FixedEncodingTy> FixedEncodings;
   SequenceToOffsetTable<TypeSigTy> LongEncodingTable;
 
   FixedEncodings.reserve(Ints.size());
@@ -296,69 +322,59 @@ void IntrinsicEmitter::EmitGenerator(const CodeGenIntrinsicTable &Ints,
     // Get the signature for the intrinsic.
     TypeSigTy TypeSig = ComputeTypeSignature(Int);
 
-    // Check to see if we can encode it into a 32-bit word. We can only encode
-    // 8 nibbles into a 32-bit word.
-    if (TypeSig.size() <= 8) {
-      // Attempt to pack elements of TypeSig into a 32-bit word, starting from
-      // the most significant nibble.
-      unsigned Result = 0;
-      bool Failed = false;
-      for (unsigned char C : reverse(TypeSig)) {
-        if (C > 15) {
-          Failed = true;
-          break;
-        }
-        Result = (Result << 4) | C;
-      }
-
-      // If this could be encoded into a 31-bit word, return it.
-      if (!Failed && (Result >> 31) == 0) {
-        FixedEncodings.push_back(Result);
-        continue;
-      }
+    // Check to see if we can encode it into a 16/32 bit word.
+    std::optional<uint32_t> Result = encodePacked(TypeSig);
+    if (Result && (*Result & Mask) == Result) {
+      FixedEncodings.push_back(static_cast<FixedEncodingTy>(*Result));
+      continue;
     }
 
-    // Otherwise, we're going to unique the sequence into the
-    // LongEncodingTable, and use its offset in the 32-bit table instead.
     LongEncodingTable.add(TypeSig);
 
     // This is a placehold that we'll replace after the table is laid out.
-    FixedEncodings.push_back(~0U);
+    FixedEncodings.push_back(static_cast<FixedEncodingTy>(~0U));
   }
 
   LongEncodingTable.layout();
 
-  OS << R"(// Global intrinsic function declaration type table.
+  OS << formatv(R"(// Global intrinsic function declaration type table.
 #ifdef GET_INTRINSIC_GENERATOR_GLOBAL
-static constexpr unsigned IIT_Table[] = {
-  )";
+static constexpr {0} IIT_Table[] = {{
+  )",
+                FixedEncodingTypeName);
 
+  unsigned MaxOffset = 0;
   for (auto [Idx, FixedEncoding, Int] : enumerate(FixedEncodings, Ints)) {
     if ((Idx & 7) == 7)
       OS << "\n  ";
 
     // If the entry fit in the table, just emit it.
-    if (FixedEncoding != ~0U) {
+    if ((FixedEncoding & Mask) == FixedEncoding) {
       OS << "0x" << Twine::utohexstr(FixedEncoding) << ", ";
       continue;
     }
 
     TypeSigTy TypeSig = ComputeTypeSignature(Int);
+    unsigned Offset = LongEncodingTable.get(TypeSig);
+    MaxOffset = std::max(MaxOffset, Offset);
 
     // Otherwise, emit the offset into the long encoding table.  We emit it this
     // way so that it is easier to read the offset in the .def file.
-    OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", ";
+    OS << formatv("(1U<<{0}) | {1}, ", MSBPostion, Offset);
   }
 
   OS << "0\n};\n\n";
 
+  // Verify that all offsets fit in the 15/31 bits below the sentinel bit.
+  if ((MaxOffset & Mask) != MaxOffset)
+    PrintFatalError("Offset of long encoding table exceeds encoding bits");
+
   // Emit the shared table of register lists.
   OS << "static constexpr unsigned char IIT_LongEncodingTable[] = {\n";
   if (!LongEncodingTable.empty())
     LongEncodingTable.emit(
         OS, [](raw_ostream &OS, unsigned char C) { OS << (unsigned)C; });
-  OS << "  255\n};\n\n";
-
+  OS << "  255\n};\n";
   OS << "#endif\n\n"; // End of GET_INTRINSIC_GENERATOR_GLOBAL
 }
author	Rahul Joshi <rjoshi@nvidia.com>	2024-09-04 14:46:48 -0700
committer	GitHub <noreply@github.com>	2024-09-04 14:46:48 -0700
commit	dcf0160bd61d150e7b94067fcd991b466a361b08 (patch)
tree	c4f1706420e268d68157e75f6a362a34928f97f3
parent	dd754cd262222bcb489038ac791e4278d90697f0 (diff)
download	llvm-dcf0160bd61d150e7b94067fcd991b466a361b08.zip llvm-dcf0160bd61d150e7b94067fcd991b466a361b08.tar.gz llvm-dcf0160bd61d150e7b94067fcd991b466a361b08.tar.bz2