aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
diff options
context:
space:
mode:
authorRahul Joshi <rjoshi@nvidia.com>2025-09-01 13:44:18 -0700
committerGitHub <noreply@github.com>2025-09-01 13:44:18 -0700
commitdafffe262d6d1114fa83ec155241aad4e7793845 (patch)
treeb375f4458f33ad8db8a1c13338c4a4b9fa146f26 /llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
parent33d5a3b455d3bb0d0487dabb98728aeaa8cba03b (diff)
downloadllvm-dafffe262d6d1114fa83ec155241aad4e7793845.zip
llvm-dafffe262d6d1114fa83ec155241aad4e7793845.tar.gz
llvm-dafffe262d6d1114fa83ec155241aad4e7793845.tar.bz2
[LLVM][MC][DecoderEmitter] Add support to specialize decoder per bitwidth (#154865)
This change adds an option to specialize decoders per bitwidth, which can help reduce the (compiled) code size of the decoder code. **Current state**: Currently, the code generated by the decoder emitter consists of two key functions: `decodeInstruction` which is the entry point into the generated code and `decodeToMCInst` which is invoked when a decode op is reached while traversing through the decoder table. Both functions are templated on `InsnType` which is the raw instruction bits that are supplied to `decodeInstruction`. Several backends call `decodeInstruction` with different `InsnType` types, leading to several template instantiations of these functions in the final code. As an example, AMDGPU instantiates this function with type `DecoderUInt128` type for decoding 96/128-bit instructions, `uint64_t` for decoding 64-bit instructions, and `uint32_t` for decoding 32-bit instructions. Since there is just one `decodeToMCInst` in the generated code, it has code that handles decoding for *all* instruction sizes. However, the decoders emitted for different instructions sizes rarely have any intersection with each other. That means, in the AMDGPU case, the instantiation with InsnType == DecoderUInt128 has decoder code for 32/64-bit instructions that is *never exercised*. Conversely, the instantiation with InsnType == uint64_t has decoder code for 128/96/32-bit instructions that is never exercised. This leads to unnecessary dead code in the generated disassembler binary (that the compiler cannot eliminate by itself). **New state**: With this change, we introduce an option `specialize-decoders-per-bitwidth`. Under this mode, the DecoderEmitter will generate several versions of `decodeToMCInst` function, one for each bitwidth. The code is still templated, but will require backends to specify, for each `InsnType` used, the bitwidth of the instruction that the type is used to represent using a type-trait `InsnBitWidth`. This will enable the templated code to choose the right variant of `decodeToMCInst`. Under this mode, a particular instantiation will only end up instantiating a single variant of `decodeToMCInst` generated and that will include only those decoders that are applicable to a single bitwidth, resulting in elimination of the code duplication through instantiation and a reduction in code size. Additionally, under this mode, decoders are uniqued only within a given bitwidth (as opposed to across all bitwidths without this option), so the decoder index values assigned are smaller, and consume less bytes in their ULEB128 encoding. As a result, the generated decoder tables can also reduce in size. Adopt this feature for the AMDGPU and RISCV backend. In a release build, this results in a net 55% reduction in the .text size of libLLVMAMDGPUDisassembler.so and a 5% reduction in the .rodata size. For RISCV, which today uses a single `uint64_t` type, this results in a 3.7% increase in code size (expected as we instantiate the code 3 times now). Actual measured sizes are as follows: ``` Baseline commit: 72c04bb882ad70230bce309c3013d9cc2c99e9a7 Configuration: Ubuntu clang version 18.1.3, release build with asserts disabled. AMDGPU Before After Change ====================================================== .text 612327 275607 55% reduction .rodata 369728 351336 5% reduction RISCV: ====================================================== .text 47407 49187 3.7% increase .rodata 35768 35839 0.1% increase ```
Diffstat (limited to 'llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h')
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h38
1 files changed, 0 insertions, 38 deletions
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164b..ded447b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
class MCSubtargetInfo;
class Twine;
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
- uint64_t Lo = 0;
- uint64_t Hi = 0;
-
-public:
- DecoderUInt128() = default;
- DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
- operator bool() const { return Lo || Hi; }
- uint64_t extractBitsAsZExtValue(unsigned NumBits,
- unsigned BitPosition) const {
- assert(NumBits && NumBits <= 64);
- assert(BitPosition < 128);
- uint64_t Val;
- if (BitPosition < 64)
- Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
- else
- Val = Hi >> (BitPosition - 64);
- return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
- }
- DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
- return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
- }
- DecoderUInt128 operator&(const uint64_t &RHS) const {
- return *this & DecoderUInt128(RHS);
- }
- DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
- bool operator==(const DecoderUInt128 &RHS) {
- return Lo == RHS.Lo && Hi == RHS.Hi;
- }
- bool operator!=(const DecoderUInt128 &RHS) {
- return Lo != RHS.Lo || Hi != RHS.Hi;
- }
- bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
//===----------------------------------------------------------------------===//
// AMDGPUDisassembler
//===----------------------------------------------------------------------===//