From dbea538c4391caa8a369c0ccf720367f042185b1 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 5 Jan 2024 18:51:02 -0800 Subject: [DWARFLinker] Support MD5 checksums in the line table (#77151) Add support to the DWARF linkers for emitting DWARF 5 MD5 checksum in the line table. --- llvm/lib/DWARFLinker/DWARFStreamer.cpp | 24 +++++++++++++++------- .../DWARFLinkerParallel/DebugLineSectionEmitter.h | 13 +++++++++++- llvm/lib/DWARFLinkerParallel/OutputSections.cpp | 4 ++++ llvm/lib/DWARFLinkerParallel/OutputSections.h | 2 ++ .../ARM/dwarf5-dwarf4-combination-macho.test | 12 +++++------ llvm/test/tools/dsymutil/ARM/dwarf5-macho.test | 9 ++++---- 6 files changed, 46 insertions(+), 18 deletions(-) diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp index faa9136..3ec082f 100644 --- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp @@ -859,10 +859,8 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( for (auto Include : P.IncludeDirectories) emitLineTableString(P, Include, DebugStrPool, DebugLineStrPool); - bool InlineSources = any_of(P.FileNames, [](auto &File) { - auto s = dwarf::toString(File.Source); - return s && !**s; - }); + bool HasChecksums = P.ContentTypes.HasMD5; + bool HasInlineSources = P.ContentTypes.HasSource; if (P.FileNames.empty()) { // file_name_entry_format_count (ubyte). @@ -870,7 +868,7 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( LineSectionSize += 1; } else { // file_name_entry_format_count (ubyte). - MS->emitInt8(2 + (InlineSources ? 1 : 0)); + MS->emitInt8(2 + (HasChecksums ? 1 : 0) + (HasInlineSources ? 1 : 0)); LineSectionSize += 1; // file_name_entry_format (sequence of ULEB128 pairs). @@ -880,7 +878,13 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_directory_index); LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_data1); - if (InlineSources) { + + if (HasChecksums) { + LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_MD5); + LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_data16); + } + + if (HasInlineSources) { LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_LLVM_source); LineSectionSize += MS->emitULEB128IntValue(StrForm); } @@ -894,7 +898,13 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( emitLineTableString(P, File.Name, DebugStrPool, DebugLineStrPool); MS->emitInt8(File.DirIdx); LineSectionSize += 1; - if (InlineSources) + if (HasChecksums) { + MS->emitBinaryData( + StringRef(reinterpret_cast(File.Checksum.data()), + File.Checksum.size())); + LineSectionSize += File.Checksum.size(); + } + if (HasInlineSources) emitLineTableString(P, File.Source, DebugStrPool, DebugLineStrPool); } } diff --git a/llvm/lib/DWARFLinkerParallel/DebugLineSectionEmitter.h b/llvm/lib/DWARFLinkerParallel/DebugLineSectionEmitter.h index fc7f8cb..27c63fa 100644 --- a/llvm/lib/DWARFLinkerParallel/DebugLineSectionEmitter.h +++ b/llvm/lib/DWARFLinkerParallel/DebugLineSectionEmitter.h @@ -197,7 +197,7 @@ private: Section.emitIntVal(0, 1); } else { // file_name_entry_format_count (ubyte). - Section.emitIntVal(2, 1); + Section.emitIntVal(2 + (P.ContentTypes.HasMD5 ? 1 : 0), 1); // file_name_entry_format (sequence of ULEB128 pairs). 
encodeULEB128(dwarf::DW_LNCT_path, Section.OS); @@ -205,6 +205,11 @@ private: encodeULEB128(dwarf::DW_LNCT_directory_index, Section.OS); encodeULEB128(dwarf::DW_FORM_data1, Section.OS); + + if (P.ContentTypes.HasMD5) { + encodeULEB128(dwarf::DW_LNCT_MD5, Section.OS); + encodeULEB128(dwarf::DW_FORM_data16, Section.OS); + } } // file_names_count (ULEB128). @@ -222,6 +227,12 @@ private: // source file. Section.emitString(File.Name.getForm(), *FileNameStr); Section.emitIntVal(File.DirIdx, 1); + + if (P.ContentTypes.HasMD5) { + Section.emitBinaryData( + StringRef(reinterpret_cast(File.Checksum.data()), + File.Checksum.size())); + } } } diff --git a/llvm/lib/DWARFLinkerParallel/OutputSections.cpp b/llvm/lib/DWARFLinkerParallel/OutputSections.cpp index 9c3e3eb..730ae0f 100644 --- a/llvm/lib/DWARFLinkerParallel/OutputSections.cpp +++ b/llvm/lib/DWARFLinkerParallel/OutputSections.cpp @@ -227,6 +227,10 @@ void SectionDescriptor::emitIntVal(uint64_t Val, unsigned Size) { } } +void SectionDescriptor::emitBinaryData(llvm::StringRef Data) { + OS.write(Data.data(), Data.size()); +} + void SectionDescriptor::apply(uint64_t PatchOffset, dwarf::Form AttrForm, uint64_t Val) { switch (AttrForm) { diff --git a/llvm/lib/DWARFLinkerParallel/OutputSections.h b/llvm/lib/DWARFLinkerParallel/OutputSections.h index f23b2efb..0f394b0 100644 --- a/llvm/lib/DWARFLinkerParallel/OutputSections.h +++ b/llvm/lib/DWARFLinkerParallel/OutputSections.h @@ -283,6 +283,8 @@ struct SectionDescriptor { void emitString(dwarf::Form StringForm, const char *StringVal); + void emitBinaryData(llvm::StringRef Data); + /// Emit specified inplace string value into the current section contents. void emitInplaceString(StringRef String) { OS << GlobalData.translateString(String); diff --git a/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test b/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test index 0199bf2..d5b78bd 100644 --- a/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test +++ b/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test @@ -73,7 +73,7 @@ CHECK-NEXT: DW_AT_low_pc [DW_FORM_addrx] (indexed (00000000) address = 0x[[ CHECK: DW_AT_linkage_name [DW_FORM_strx] (indexed (00000005) string = "_Z4foo2i") CHECK: DW_AT_name [DW_FORM_strx] (indexed (00000006) string = "foo2") CHECK: DW_TAG_formal_parameter -CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x[[LOCLIST_OFFSET:[0-9a-f]+]]: +CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x[[LOCLIST_OFFSET:[0-9a-f]+]]: CHECK-NEXT: [0x[[#%.16x,LOCLIST_PAIR_START:]], 0x[[#%.16x,LOCLIST_PAIR_END:]]): [[LOCLIST_EXPR:.*]] CHECK-NEXT: [0x[[#%.16x,LOCLIST_PAIR_START2:]], 0x[[#%.16x,LOCLIST_PAIR_END2:]]): [[LOCLIST_EXPR2:.*]]) CHECK: DW_AT_name [DW_FORM_strx] (indexed (00000007) string = "a") @@ -93,7 +93,7 @@ CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x[[#%.16x,LOC_LOWPC CHECK: DW_AT_linkage_name [DW_FORM_strp] ( .debug_str[0x000000e6] = "_Z3bari") CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x000000ee] = "bar") CHECK: DW_TAG_formal_parameter -CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x[[LOC_OFFSET:[0-9a-f]+]]: +CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x[[LOC_OFFSET:[0-9a-f]+]]: CHECK-NEXT: [0x[[#%.16x,LOC_PAIR_START:]], 0x[[#%.16x,LOC_PAIR_END:]]): [[LOC_EXPR:.*]] CHECK-NEXT: [0x[[#%.16x,LOC_PAIR_START2:]], 0x[[#%.16x,LOC_PAIR_END2:]]): [[LOC_EXPR2:.*]]) CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x000000f2] = "x") @@ -105,7 +105,7 @@ CHECK-NEXT: (0x[[#sub(LOC_PAIR_START2,LOC_LOWPC)]], 0x[[#sub(LOC_PAIR CHECK: 
.debug_loclists contents: CHECK-NEXT: 0x00000000: locations list header: length = 0x00000018, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000 -CHECK-NEXT: 0x[[LOCLIST_OFFSET]]: +CHECK-NEXT: 0x[[LOCLIST_OFFSET]]: CHECK-NEXT: DW_LLE_base_addressx (0x0000000000000000) CHECK-NEXT: DW_LLE_offset_pair (0x[[#sub(LOCLIST_PAIR_START,LOCLIST_LOWPC)]], 0x[[#sub(LOCLIST_PAIR_END,LOCLIST_LOWPC)]]) CHECK-NEXT: DW_LLE_offset_pair (0x[[#sub(LOCLIST_PAIR_START2,LOCLIST_LOWPC)]], 0x[[#sub(LOCLIST_PAIR_END2,LOCLIST_LOWPC)]]) @@ -114,12 +114,12 @@ CHECK-NEXT: DW_LLE_end_of_list () CHECK: .debug_line contents: CHECK-NEXT: debug_line[0x00000000] CHECK-NEXT: Line table prologue: -CHECK-NEXT: total_length: 0x00000048 +CHECK-NEXT: total_length: 0x0000005a CHECK-NEXT: format: DWARF32 CHECK-NEXT: version: 5 CHECK-NEXT: address_size: 8 CHECK-NEXT: seg_select_size: 0 -CHECK-NEXT: prologue_length: 0x00000025 +CHECK-NEXT: prologue_length: 0x00000037 CHECK-NEXT: min_inst_length: 1 CHECK-NEXT: max_ops_per_inst: 1 CHECK-NEXT: default_is_stmt: 1 @@ -143,7 +143,7 @@ CHECK-NEXT: file_names[ 0]: CHECK-NEXT: name: .debug_line_str[0x00000029] = "a.cpp" CHECK-NEXT: dir_index: 0 -CHECK: debug_line[0x0000004c] +CHECK: debug_line[0x0000005e] CHECK-NEXT: Line table prologue: CHECK-NEXT: total_length: 0x0000003b CHECK-NEXT: format: DWARF32 diff --git a/llvm/test/tools/dsymutil/ARM/dwarf5-macho.test b/llvm/test/tools/dsymutil/ARM/dwarf5-macho.test index 13409b2..f6d42a1 100644 --- a/llvm/test/tools/dsymutil/ARM/dwarf5-macho.test +++ b/llvm/test/tools/dsymutil/ARM/dwarf5-macho.test @@ -49,13 +49,13 @@ CHECK-NEXT: DW_AT_addr_base [DW_FORM_sec_offset] (0x00000008) CHECK: DW_TAG_subprogram CHECK-NEXT: DW_AT_low_pc [DW_FORM_addrx] (indexed (00000000) address = 0x[[#%.16x,LOCLIST_LOWPC:]]) CHECK: DW_TAG_formal_parameter -CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x[[LOC_OFFSET:[0-9a-f]+]]: +CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x[[LOC_OFFSET:[0-9a-f]+]]: CHECK-NEXT: [0x[[#%.16x,LOCLIST_PAIR_START:]], 0x[[#%.16x,LOCLIST_PAIR_END:]]): [[LOCLIST_EXPR:.*]] CHECK-NEXT: [0x[[#%.16x,LOCLIST_PAIR_START2:]], 0x[[#%.16x,LOCLIST_PAIR_END2:]]): [[LOCLIST_EXPR2:.*]]) CHECK: .debug_loclists contents: CHECK-NEXT: 0x00000000: locations list header: length = 0x00000018, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000 -CHECK-NEXT: 0x[[LOC_OFFSET]]: +CHECK-NEXT: 0x[[LOC_OFFSET]]: CHECK-NEXT: DW_LLE_base_addressx (0x0000000000000000) CHECK-NEXT: DW_LLE_offset_pair (0x[[#sub(LOCLIST_PAIR_START,LOCLIST_LOWPC)]], 0x[[#sub(LOCLIST_PAIR_END,LOCLIST_LOWPC)]]) CHECK-NEXT: DW_LLE_offset_pair (0x[[#sub(LOCLIST_PAIR_START2,LOCLIST_LOWPC)]], 0x[[#sub(LOCLIST_PAIR_END2,LOCLIST_LOWPC)]]) @@ -64,12 +64,12 @@ CHECK-NEXT: DW_LLE_end_of_list () CHECK: .debug_line contents: CHECK-NEXT: debug_line[0x00000000] CHECK-NEXT: Line table prologue: -CHECK-NEXT: total_length: 0x00000048 +CHECK-NEXT: total_length: 0x0000005a CHECK-NEXT: format: DWARF32 CHECK-NEXT: version: 5 CHECK-NEXT: address_size: 8 CHECK-NEXT: seg_select_size: 0 -CHECK-NEXT: prologue_length: 0x00000025 +CHECK-NEXT: prologue_length: 0x00000037 CHECK-NEXT: min_inst_length: 1 CHECK-NEXT: max_ops_per_inst: 1 CHECK-NEXT: default_is_stmt: 1 @@ -92,6 +92,7 @@ CHECK-NEXT: include_directories[ 0] = .debug_line_str[0x00000000] = "/Users/sh CHECK-NEXT: file_names[ 0]: CHECK-NEXT: name: .debug_line_str[0x00000029] = "a.cpp" CHECK-NEXT: dir_index: 0 +CHECK-NEXT: md5_checksum: 
2675ab7ce3623b564cfd8a2906a462e5 CHECK: .debug_str contents: -- cgit v1.1 From 2873060f3cfbd92dcff8d1037a08e9fb60f7882e Mon Sep 17 00:00:00 2001 From: Micah Weston Date: Fri, 5 Jan 2024 21:59:51 -0500 Subject: [SHT_LLVM_BB_ADDR_MAP] Fixes two bugs in decoding of PGOAnalyses in BBAddrMap. (#77139) We had specified that `readBBAddrMap` will always keep PGOAnalyses and BBAddrMaps the same length on success. https://github.com/llvm/llvm-project/blob/365fbbfbcfefb8766f7716109b9c3767b58e6058/llvm/include/llvm/Object/ELFObjectFile.h#L116-L117 It turns out that this is not currently the case when no analyses exist in a function. No test had caught it. We also should not append PGOBBEntries when there is no BBFreq or BrProb. This patch adds: * tests that PGOAnalyses and BBAddrMaps are same length even when no analyses are enabled * fixes decode so that PGOAnalyses and BBAddrMaps are same length * updates test to not emit unnecessary PGOBBEntries * fixes decode to not emit PGOBBEntries when unnecessary --- llvm/include/llvm/Object/ELFTypes.h | 3 +++ llvm/lib/Object/ELF.cpp | 7 +++--- llvm/unittests/Object/ELFObjectFileTest.cpp | 37 ++++++++++++++++++++++------- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h index d3351a2..956f781 100644 --- a/llvm/include/llvm/Object/ELFTypes.h +++ b/llvm/include/llvm/Object/ELFTypes.h @@ -885,6 +885,9 @@ struct PGOAnalysisMap { bool BBFreq : 1; bool BrProb : 1; + // True if at least one feature is enabled + bool anyEnabled() const { return FuncEntryCount || BBFreq || BrProb; } + // Encodes to minimum bit width representation. uint8_t encode() const { return (static_cast(FuncEntryCount) << 0) | diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 300639f..f24395b 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -774,7 +774,7 @@ decodeBBAddrMapImpl(const ELFFile &EF, } FunctionEntries.emplace_back(Address, std::move(BBEntries)); - if (FeatEnable.FuncEntryCount || FeatEnable.BBFreq || FeatEnable.BrProb) { + if (PGOAnalyses || FeatEnable.anyEnabled()) { // Function entry count uint64_t FuncEntryCount = FeatEnable.FuncEntryCount @@ -782,8 +782,9 @@ decodeBBAddrMapImpl(const ELFFile &EF, : 0; std::vector PGOBBEntries; - for (uint32_t BlockIndex = 0; !MetadataDecodeErr && !ULEBSizeErr && Cur && - (BlockIndex < NumBlocks); + for (uint32_t BlockIndex = 0; + (FeatEnable.BBFreq || FeatEnable.BrProb) && !MetadataDecodeErr && + !ULEBSizeErr && Cur && (BlockIndex < NumBlocks); ++BlockIndex) { // Block frequency uint64_t BBF = FeatEnable.BBFreq diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp index 48bebc5..3a2a8e3 100644 --- a/llvm/unittests/Object/ELFObjectFileTest.cpp +++ b/llvm/unittests/Object/ELFObjectFileTest.cpp @@ -1017,10 +1017,23 @@ Sections: BrProb: 0xffffffff - BBFreq: 1000 Successors: [] -)"); + - Name: .llvm_bb_addr_map_5 + Type: SHT_LLVM_BB_ADDR_MAP + # Link: 0 (by default, can be overriden) + Entries: + - Version: 2 + Address: 0x55555 + Feature: 0x0 + BBEntries: + - ID: 2 + AddressOffset: 0x0 + Size: 0x2 + Metadata: 0x4 + PGOAnalyses: [{}] + )"); BBAddrMap E1(0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}}}); - PGOAnalysisMap P1 = {892, {{}}, {true, false, false}}; + PGOAnalysisMap P1 = {892, {}, {true, false, false}}; BBAddrMap E2(0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}}}); PGOAnalysisMap P2 = {{}, {{BlockFrequency(343), {}}}, {false, true, false}}; 
BBAddrMap E3(0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}}, @@ -1049,16 +1062,18 @@ Sections: {BlockFrequency(18), {{3, BranchProbability::getRaw(0xffff'ffff)}}}, {BlockFrequency(1000), {}}}, {true, true, true}}; + BBAddrMap E5(0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}}}); + PGOAnalysisMap P5 = {{}, {}, {false, false, false}}; - std::vector Section0BBAddrMaps = {E4}; + std::vector Section0BBAddrMaps = {E4, E5}; std::vector Section1BBAddrMaps = {E3}; std::vector Section2BBAddrMaps = {E1, E2}; - std::vector AllBBAddrMaps = {E1, E2, E3, E4}; + std::vector AllBBAddrMaps = {E1, E2, E3, E4, E5}; - std::vector Section0PGOAnalysisMaps = {P4}; + std::vector Section0PGOAnalysisMaps = {P4, P5}; std::vector Section1PGOAnalysisMaps = {P3}; std::vector Section2PGOAnalysisMaps = {P1, P2}; - std::vector AllPGOAnalysisMaps = {P1, P2, P3, P4}; + std::vector AllPGOAnalysisMaps = {P1, P2, P3, P4, P5}; auto DoCheckSucceeds = [&](StringRef YamlString, std::optional TextSectionIndex, @@ -1081,6 +1096,10 @@ Sections: if (ExpectedPGO) { EXPECT_EQ(BBAddrMaps->size(), PGOAnalyses.size()); EXPECT_EQ(PGOAnalyses, *ExpectedPGO); + for (auto &&[BB, PGO] : llvm::zip(*BBAddrMaps, PGOAnalyses)) { + if (PGO.FeatEnable.BBFreq || PGO.FeatEnable.BrProb) + EXPECT_EQ(BB.getBBEntries().size(), PGO.BBEntries.size()); + } } }; @@ -1132,9 +1151,9 @@ Sections: Link: 10 )"; - DoCheckFails(InvalidLinkedYamlString, /*TextSectionIndex=*/4, + DoCheckFails(InvalidLinkedYamlString, /*TextSectionIndex=*/5, "unable to get the linked-to section for " - "SHT_LLVM_BB_ADDR_MAP section with index 4: invalid section " + "SHT_LLVM_BB_ADDR_MAP section with index 5: invalid section " "index: 10"); // Linked sections are not checked when we don't target a specific text // section. @@ -1150,7 +1169,7 @@ Sections: )"; DoCheckFails(TruncatedYamlString, /*TextSectionIndex=*/std::nullopt, - "unable to read SHT_LLVM_BB_ADDR_MAP section with index 4: " + "unable to read SHT_LLVM_BB_ADDR_MAP section with index 5: " "unable to decode LEB128 at offset 0x0000000a: malformed " "uleb128, extends past end"); // Check that we can read the other section's bb-address-maps which are -- cgit v1.1 From 1637c0792550f70e4b2ef42b3d08aa91dd27f4a9 Mon Sep 17 00:00:00 2001 From: Chaitanya Date: Sat, 6 Jan 2024 09:34:48 +0530 Subject: [openmp][amdgpu] Add DynamicLdsSize to AMDGPUImplicitArgsTy (#65325) #65273 "hidden_dynamic_lds_size" argument will be added in the reserved section at offset 120 of the implicit argument layout Add DynamicLdsSize to AMDGPUImplicitArgsTy struct at offset 120 and fill the dynamic LDS size before kernel launch. --- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp | 1 + openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 0411c67..18076f8 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3203,6 +3203,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, ImplArgs->GroupSizeY = 1; ImplArgs->GroupSizeZ = 1; ImplArgs->GridDims = 1; + ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem; } // Push the kernel launch into the stream. 
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 2471590..58a3b5d 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -45,7 +45,9 @@ struct AMDGPUImplicitArgsTy { uint16_t GroupSizeZ; uint8_t Unused0[46]; // 46 byte offset. uint16_t GridDims; - uint8_t Unused1[190]; // 190 byte offset. + uint8_t Unused1[54]; // 54 byte offset. + uint32_t DynamicLdsSize; + uint8_t Unused2[132]; // 132 byte offset. }; // Dummy struct for COV4 implicitargs. -- cgit v1.1 From ba3ef331b4568b5996172076572581e68c2d3c0c Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Fri, 5 Jan 2024 23:19:46 -0500 Subject: [RISCV][GlobalISel] Zbkb support for G_BSWAP (#77050) This instruction is legal in the presence of the Zbkb extension. --- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 2 +- .../GlobalISel/instruction-select/bswap-rv32.mir | 15 +++--- .../GlobalISel/instruction-select/bswap-rv64.mir | 15 +++--- .../GlobalISel/legalizer/legalize-bswap-rv32.mir | 58 +++++++++++----------- .../GlobalISel/legalizer/legalize-bswap-rv64.mir | 58 +++++++++++----------- 5 files changed, 77 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 61bae58..ab80707 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -113,7 +113,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder(G_BITREVERSE).maxScalar(0, sXLen).lower(); auto &BSWAPActions = getActionDefinitionsBuilder(G_BSWAP); - if (ST.hasStdExtZbb()) + if (ST.hasStdExtZbb() || ST.hasStdExtZbkb()) BSWAPActions.legalFor({sXLen}).clampScalar(0, sXLen, sXLen); else BSWAPActions.maxScalar(0, sXLen).lower(); diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv32.mir index 733fd12..721721c 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv32.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=instruction-select \ -# RUN: -simplify-mir -verify-machineinstrs %s -o - \ -# RUN: | FileCheck -check-prefix=RV32I %s +# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=riscv32 -mattr=+zbkb -run-pass=instruction-select \ +# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s --- name: bswap_s32 @@ -9,11 +9,11 @@ legalized: true regBankSelected: true body: | bb.0.entry: - ; RV32I-LABEL: name: bswap_s32 - ; RV32I: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV32I-NEXT: [[REV8_RV32_:%[0-9]+]]:gpr = REV8_RV32 [[COPY]] - ; RV32I-NEXT: $x10 = COPY [[REV8_RV32_]] - ; RV32I-NEXT: PseudoRET implicit $x10 + ; CHECK-LABEL: name: bswap_s32 + ; CHECK: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: [[REV8_RV32_:%[0-9]+]]:gpr = REV8_RV32 [[COPY]] + ; CHECK-NEXT: $x10 = COPY [[REV8_RV32_]] + ; CHECK-NEXT: PseudoRET implicit $x10 %0:gprb(s32) = COPY $x10 %1:gprb(s32) = G_BSWAP %0 $x10 = COPY %1(s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv64.mir index 053abef..6cdfb76 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/bswap-rv64.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv64 -mattr=+zbb -run-pass=instruction-select \ -# RUN: -simplify-mir -verify-machineinstrs %s -o - \ -# RUN: | FileCheck -check-prefix=RV64I %s +# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=riscv64 -mattr=+zbkb -run-pass=instruction-select \ +# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s --- name: bswap_s64 @@ -9,11 +10,11 @@ legalized: true regBankSelected: true body: | bb.0.entry: - ; RV64I-LABEL: name: bswap_s64 - ; RV64I: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV64I-NEXT: [[REV8_RV64_:%[0-9]+]]:gpr = REV8_RV64 [[COPY]] - ; RV64I-NEXT: $x10 = COPY [[REV8_RV64_]] - ; RV64I-NEXT: PseudoRET implicit $x10 + ; CHECK-LABEL: name: bswap_s64 + ; CHECK: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: [[REV8_RV64_:%[0-9]+]]:gpr = REV8_RV64 [[COPY]] + ; CHECK-NEXT: $x10 = COPY [[REV8_RV64_]] + ; CHECK-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s64) = G_BSWAP %0 $x10 = COPY %1(s64) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv32.mir index e66dbfa..d6598c8 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv32.mir @@ -2,7 +2,9 @@ # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \ # RUN: | FileCheck %s --check-prefix=RV32I # RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s --check-prefix=RV32ZBB +# RUN: | FileCheck %s --check-prefix=RV32ZBB_OR_RV32ZBKB +# RUN: llc -mtriple=riscv32 -mattr=+zbkb -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s --check-prefix=RV32ZBB_OR_RV32ZBKB --- name: bswap_i16 @@ -23,16 +25,16 @@ body: | ; RV32I-NEXT: $x10 = COPY [[AND]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: bswap_i16 - ; RV32ZBB: liveins: $x10 - ; RV32ZBB-NEXT: {{ $}} - ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 - ; RV32ZBB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 16 - ; RV32ZBB-NEXT: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[ASSERT_ZEXT]] - ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BSWAP]], [[C]](s32) - ; RV32ZBB-NEXT: $x10 = COPY [[LSHR]](s32) - ; RV32ZBB-NEXT: PseudoRET implicit $x10 + ; RV32ZBB_OR_RV32ZBKB-LABEL: name: bswap_i16 + ; RV32ZBB_OR_RV32ZBKB: liveins: $x10 + ; RV32ZBB_OR_RV32ZBKB-NEXT: {{ $}} + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 16 + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[ASSERT_ZEXT]] + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BSWAP]], [[C]](s32) + ; RV32ZBB_OR_RV32ZBKB-NEXT: $x10 = COPY [[LSHR]](s32) + ; RV32ZBB_OR_RV32ZBKB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 %1:_(s32) = G_ASSERT_ZEXT %0, 16 %2:_(s16) = G_TRUNC %1(s32) @@ -65,13 +67,13 @@ body: | ; RV32I-NEXT: $x10 = COPY [[OR2]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 ; - ; RV32ZBB-LABEL: name: bswap_i32 - ; RV32ZBB: liveins: $x10 - ; RV32ZBB-NEXT: {{ $}} - ; RV32ZBB-NEXT: 
[[COPY:%[0-9]+]]:_(s32) = COPY $x10 - ; RV32ZBB-NEXT: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[COPY]] - ; RV32ZBB-NEXT: $x10 = COPY [[BSWAP]](s32) - ; RV32ZBB-NEXT: PseudoRET implicit $x10 + ; RV32ZBB_OR_RV32ZBKB-LABEL: name: bswap_i32 + ; RV32ZBB_OR_RV32ZBKB: liveins: $x10 + ; RV32ZBB_OR_RV32ZBKB-NEXT: {{ $}} + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[COPY]] + ; RV32ZBB_OR_RV32ZBKB-NEXT: $x10 = COPY [[BSWAP]](s32) + ; RV32ZBB_OR_RV32ZBKB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 %1:_(s32) = G_BSWAP %0 $x10 = COPY %1(s32) @@ -115,16 +117,16 @@ body: | ; RV32I-NEXT: $x11 = COPY [[OR5]](s32) ; RV32I-NEXT: PseudoRET implicit $x10, implicit $x11 ; - ; RV32ZBB-LABEL: name: bswap_i64 - ; RV32ZBB: liveins: $x10, $x11 - ; RV32ZBB-NEXT: {{ $}} - ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 - ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; RV32ZBB-NEXT: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[COPY1]] - ; RV32ZBB-NEXT: [[BSWAP1:%[0-9]+]]:_(s32) = G_BSWAP [[COPY]] - ; RV32ZBB-NEXT: $x10 = COPY [[BSWAP]](s32) - ; RV32ZBB-NEXT: $x11 = COPY [[BSWAP1]](s32) - ; RV32ZBB-NEXT: PseudoRET implicit $x10, implicit $x11 + ; RV32ZBB_OR_RV32ZBKB-LABEL: name: bswap_i64 + ; RV32ZBB_OR_RV32ZBKB: liveins: $x10, $x11 + ; RV32ZBB_OR_RV32ZBKB-NEXT: {{ $}} + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[BSWAP:%[0-9]+]]:_(s32) = G_BSWAP [[COPY1]] + ; RV32ZBB_OR_RV32ZBKB-NEXT: [[BSWAP1:%[0-9]+]]:_(s32) = G_BSWAP [[COPY]] + ; RV32ZBB_OR_RV32ZBKB-NEXT: $x10 = COPY [[BSWAP]](s32) + ; RV32ZBB_OR_RV32ZBKB-NEXT: $x11 = COPY [[BSWAP1]](s32) + ; RV32ZBB_OR_RV32ZBKB-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s32) = COPY $x10 %1:_(s32) = COPY $x11 %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv64.mir index b73a22c..61a0de9 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bswap-rv64.mir @@ -2,7 +2,9 @@ # RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \ # RUN: | FileCheck %s --check-prefix=RV64I # RUN: llc -mtriple=riscv64 -mattr=+zbb -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s --check-prefix=RV64ZBB +# RUN: | FileCheck %s --check-prefix=RV64ZBB_OR_RV64ZBKB +# RUN: llc -mtriple=riscv64 -mattr=+zbkb -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s --check-prefix=RV64ZBB_OR_RV64ZBKB --- name: bswap_i16 @@ -27,16 +29,16 @@ body: | ; RV64I-NEXT: $x10 = COPY [[AND]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: bswap_i16 - ; RV64ZBB: liveins: $x10 - ; RV64ZBB-NEXT: {{ $}} - ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 - ; RV64ZBB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s64) = G_ASSERT_ZEXT [[COPY]], 16 - ; RV64ZBB-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP [[ASSERT_ZEXT]] - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[LSHR]](s64) - ; RV64ZBB-NEXT: PseudoRET implicit $x10 + ; RV64ZBB_OR_RV64ZBKB-LABEL: name: bswap_i16 + ; RV64ZBB_OR_RV64ZBKB: liveins: $x10 + ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}} + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s64) = 
G_ASSERT_ZEXT [[COPY]], 16 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP [[ASSERT_ZEXT]] + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64) + ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[LSHR]](s64) + ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = G_ASSERT_ZEXT %0, 16 %2:_(s16) = G_TRUNC %1(s64) @@ -74,16 +76,16 @@ body: | ; RV64I-NEXT: $x10 = COPY [[ZEXT]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: bswap_i32 - ; RV64ZBB: liveins: $x10 - ; RV64ZBB-NEXT: {{ $}} - ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 - ; RV64ZBB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s64) = G_ASSERT_ZEXT [[COPY]], 32 - ; RV64ZBB-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP [[ASSERT_ZEXT]] - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 - ; RV64ZBB-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[LSHR]](s64) - ; RV64ZBB-NEXT: PseudoRET implicit $x10 + ; RV64ZBB_OR_RV64ZBKB-LABEL: name: bswap_i32 + ; RV64ZBB_OR_RV64ZBKB: liveins: $x10 + ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}} + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s64) = G_ASSERT_ZEXT [[COPY]], 32 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP [[ASSERT_ZEXT]] + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64) + ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[LSHR]](s64) + ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = G_ASSERT_ZEXT %0, 32 %2:_(s32) = G_TRUNC %1(s64) @@ -132,13 +134,13 @@ body: | ; RV64I-NEXT: $x10 = COPY [[OR6]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; - ; RV64ZBB-LABEL: name: bswap_i64 - ; RV64ZBB: liveins: $x10 - ; RV64ZBB-NEXT: {{ $}} - ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 - ; RV64ZBB-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP [[COPY]] - ; RV64ZBB-NEXT: $x10 = COPY [[BSWAP]](s64) - ; RV64ZBB-NEXT: PseudoRET implicit $x10 + ; RV64ZBB_OR_RV64ZBKB-LABEL: name: bswap_i64 + ; RV64ZBB_OR_RV64ZBKB: liveins: $x10 + ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}} + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64ZBB_OR_RV64ZBKB-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP [[COPY]] + ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[BSWAP]](s64) + ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = G_BSWAP %0 $x10 = COPY %1(s64) -- cgit v1.1 From a5902a4d2425ac083f1530719e35b5c562cb1e60 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sat, 6 Jan 2024 11:33:36 +0800 Subject: [X86][NFC] Rename variables/passes for EVEX compression optimization RFC: https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031 APX introduces EGPR, NDD and NF instructions. In addition to compressing EVEX encoded AVX512 instructions into VEX encoding, we also have several more possible optimizations. a. Promoted instruction (EVEX space) -> pre-promotion instruction (legacy space) b. NDD (EVEX space) -> non-NDD (legacy space) c. NF_ND (EVEX space) -> NF (EVEX space) The first two types of compression can usually reduce code size, while the third type of compression can help hardware decode although the instruction length remains unchanged. So we do the renaming for the upcoming APX optimizations. 
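For illustration only (not part of this patch): a minimal, self-contained sketch of the kind of sorted-table lookup the renamed pass performs, mirroring the OldOpc/NewOpc table entry and the lower_bound search introduced in X86CompressEVEX.cpp below. The type and function names here are invented for the example and do not exist in the tree.

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // One table row: an EVEX-space opcode and its shorter replacement opcode.
  struct CompressTableEntry {
    uint16_t OldOpc; // EVEX-space opcode
    uint16_t NewOpc; // legacy/VEX/EVEX replacement
  };

  // Binary-search a table sorted by OldOpc; returns 0 if the instruction has
  // no compressed form.
  inline uint16_t lookupCompressedOpc(const std::vector<CompressTableEntry> &Table,
                                      uint16_t Opc) {
    auto It = std::lower_bound(Table.begin(), Table.end(), Opc,
                               [](const CompressTableEntry &E, uint16_t O) {
                                 return E.OldOpc < O;
                               });
    return (It != Table.end() && It->OldOpc == Opc) ? It->NewOpc : 0;
  }

The real pass additionally checks operand constraints (extended registers, masks, broadcast, vector length) before rewriting the MCInstrDesc, as shown in the new file below.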
BTW, I clang-format the code in X86CompressEVEX.cpp, X86CompressEVEXTablesEmitter.cpp. This patch also extracts the NFC in #77065 into a separate commit. --- llvm/lib/Target/X86/CMakeLists.txt | 4 +- llvm/lib/Target/X86/X86.h | 8 +- llvm/lib/Target/X86/X86CompressEVEX.cpp | 309 +++++++++++++++++++++ llvm/lib/Target/X86/X86EvexToVex.cpp | 296 -------------------- llvm/lib/Target/X86/X86TargetMachine.cpp | 4 +- llvm/test/CodeGen/X86/O0-pipeline.ll | 2 +- llvm/test/CodeGen/X86/evex-to-vex-compress.mir | 2 +- llvm/test/CodeGen/X86/opt-pipeline.ll | 2 +- llvm/utils/TableGen/CMakeLists.txt | 2 +- .../TableGen/X86CompressEVEXTablesEmitter.cpp | 209 ++++++++++++++ llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp | 210 -------------- 11 files changed, 530 insertions(+), 518 deletions(-) create mode 100644 llvm/lib/Target/X86/X86CompressEVEX.cpp delete mode 100644 llvm/lib/Target/X86/X86EvexToVex.cpp create mode 100644 llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp delete mode 100644 llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 0b7a98a..4d6300c 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -8,7 +8,7 @@ tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel) tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler) -tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) +tablegen(LLVM X86GenCompressEVEXTables.inc -gen-x86-compress-evex-tables) tablegen(LLVM X86GenExegesis.inc -gen-exegesis) tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) @@ -61,7 +61,7 @@ set(sources X86InstrFMA3Info.cpp X86InstrFoldTables.cpp X86InstrInfo.cpp - X86EvexToVex.cpp + X86CompressEVEX.cpp X86LoadValueInjectionLoadHardening.cpp X86LoadValueInjectionRetHardening.cpp X86MCInstLower.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 485afbc..21623a8 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -131,9 +131,9 @@ FunctionPass *createX86FixupBWInsts(); /// to another, when profitable. FunctionPass *createX86DomainReassignmentPass(); -/// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX -/// encoding when possible in order to reduce code size. -FunctionPass *createX86EvexToVexInsts(); +/// This pass compress instructions from EVEX space to legacy/VEX/EVEX space when +/// possible in order to reduce code size or facilitate HW decoding. +FunctionPass *createX86CompressEVEXPass(); /// This pass creates the thunks for the retpoline feature. 
FunctionPass *createX86IndirectThunksPass(); @@ -167,7 +167,7 @@ FunctionPass *createX86SpeculativeLoadHardeningPass(); FunctionPass *createX86SpeculativeExecutionSideEffectSuppression(); FunctionPass *createX86ArgumentStackSlotPass(); -void initializeEvexToVexInstPassPass(PassRegistry &); +void initializeCompressEVEXPassPass(PassRegistry &); void initializeFPSPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp new file mode 100644 index 0000000..accb98c --- /dev/null +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -0,0 +1,309 @@ +//===- X86CompressEVEX.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space +// when possible in order to reduce code size or facilitate HW decoding. +// +// Possible compression: +// a. AVX512 instruction (EVEX) -> AVX instruction (VEX) +// b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy) +// c. NDD (EVEX) -> non-NDD (legacy) +// d. NF_ND (EVEX) -> NF (EVEX) +// +// Compression a, b and c always reduce code size (some exception) +// fourth type of compression can help hardware decode although the instruction +// length remains unchanged. +// +// Compression a, b and c can always reduce code size, with some exceptions +// such as promoted 16-bit CRC32 which is as long as the legacy version. +// +// legacy: +// crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6] +// promoted: +// crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6] +// +// From performance perspective, these should be same (same uops and same EXE +// ports). From a FMV perspective, an older legacy encoding is preferred b/c it +// can execute in more places (broader HW install base). So we will still do +// the compression. +// +// Compression d can help hardware decode (HW may skip reading the NDD +// register) although the instruction length remains unchanged. +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86InstComments.h" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Pass.h" +#include +#include +#include + +using namespace llvm; + +// Including the generated EVEX compression tables. 
+struct X86CompressEVEXTableEntry { + uint16_t OldOpc; + uint16_t NewOpc; + + bool operator<(const X86CompressEVEXTableEntry &RHS) const { + return OldOpc < RHS.OldOpc; + } + + friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) { + return TE.OldOpc < Opc; + } +}; +#include "X86GenCompressEVEXTables.inc" + +#define COMP_EVEX_DESC "Compressing EVEX instrs when possible" +#define COMP_EVEX_NAME "x86-compress-evex" + +#define DEBUG_TYPE COMP_EVEX_NAME + +namespace { + +class CompressEVEXPass : public MachineFunctionPass { +public: + static char ID; + CompressEVEXPass() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { return COMP_EVEX_DESC; } + + bool runOnMachineFunction(MachineFunction &MF) override; + + // This pass runs after regalloc and doesn't support VReg operands. + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } +}; + +} // end anonymous namespace + +char CompressEVEXPass::ID = 0; + +static bool usesExtendedRegister(const MachineInstr &MI) { + auto isHiRegIdx = [](unsigned Reg) { + // Check for XMM register with indexes between 16 - 31. + if (Reg >= X86::XMM16 && Reg <= X86::XMM31) + return true; + // Check for YMM register with indexes between 16 - 31. + if (Reg >= X86::YMM16 && Reg <= X86::YMM31) + return true; + // Check for GPR with indexes between 16 - 31. + if (X86II::isApxExtendedReg(Reg)) + return true; + return false; + }; + + // Check that operands are not ZMM regs or + // XMM/YMM regs with hi indexes between 16 - 31. + for (const MachineOperand &MO : MI.explicit_operands()) { + if (!MO.isReg()) + continue; + + Register Reg = MO.getReg(); + assert(!X86II::isZMMReg(Reg) && + "ZMM instructions should not be in the EVEX->VEX tables"); + if (isHiRegIdx(Reg)) + return true; + } + + return false; +} + +static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) { + switch (OldOpc) { + default: + return true; + case X86::VCVTNEPS2BF16Z128rm: + case X86::VCVTNEPS2BF16Z128rr: + case X86::VCVTNEPS2BF16Z256rm: + case X86::VCVTNEPS2BF16Z256rr: + return ST.hasAVXNECONVERT(); + case X86::VPDPBUSDSZ128m: + case X86::VPDPBUSDSZ128r: + case X86::VPDPBUSDSZ256m: + case X86::VPDPBUSDSZ256r: + case X86::VPDPBUSDZ128m: + case X86::VPDPBUSDZ128r: + case X86::VPDPBUSDZ256m: + case X86::VPDPBUSDZ256r: + case X86::VPDPWSSDSZ128m: + case X86::VPDPWSSDSZ128r: + case X86::VPDPWSSDSZ256m: + case X86::VPDPWSSDSZ256r: + case X86::VPDPWSSDZ128m: + case X86::VPDPWSSDZ128r: + case X86::VPDPWSSDZ256m: + case X86::VPDPWSSDZ256r: + return ST.hasAVXVNNI(); + case X86::VPMADD52HUQZ128m: + case X86::VPMADD52HUQZ128r: + case X86::VPMADD52HUQZ256m: + case X86::VPMADD52HUQZ256r: + case X86::VPMADD52LUQZ128m: + case X86::VPMADD52LUQZ128r: + case X86::VPMADD52LUQZ256m: + case X86::VPMADD52LUQZ256r: + return ST.hasAVXIFMA(); + } +} + +// Do any custom cleanup needed to finalize the conversion. +static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { + (void)NewOpc; + unsigned Opc = MI.getOpcode(); + switch (Opc) { + case X86::VALIGNDZ128rri: + case X86::VALIGNDZ128rmi: + case X86::VALIGNQZ128rri: + case X86::VALIGNQZ128rmi: { + assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) && + "Unexpected new opcode!"); + unsigned Scale = + (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 
8 : 4; + MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); + Imm.setImm(Imm.getImm() * Scale); + break; + } + case X86::VSHUFF32X4Z256rmi: + case X86::VSHUFF32X4Z256rri: + case X86::VSHUFF64X2Z256rmi: + case X86::VSHUFF64X2Z256rri: + case X86::VSHUFI32X4Z256rmi: + case X86::VSHUFI32X4Z256rri: + case X86::VSHUFI64X2Z256rmi: + case X86::VSHUFI64X2Z256rri: { + assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr || + NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) && + "Unexpected new opcode!"); + MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); + int64_t ImmVal = Imm.getImm(); + // Set bit 5, move bit 1 to bit 4, copy bit 0. + Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1)); + break; + } + case X86::VRNDSCALEPDZ128rri: + case X86::VRNDSCALEPDZ128rmi: + case X86::VRNDSCALEPSZ128rri: + case X86::VRNDSCALEPSZ128rmi: + case X86::VRNDSCALEPDZ256rri: + case X86::VRNDSCALEPDZ256rmi: + case X86::VRNDSCALEPSZ256rri: + case X86::VRNDSCALEPSZ256rmi: + case X86::VRNDSCALESDZr: + case X86::VRNDSCALESDZm: + case X86::VRNDSCALESSZr: + case X86::VRNDSCALESSZm: + case X86::VRNDSCALESDZr_Int: + case X86::VRNDSCALESDZm_Int: + case X86::VRNDSCALESSZr_Int: + case X86::VRNDSCALESSZm_Int: + const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); + int64_t ImmVal = Imm.getImm(); + // Ensure that only bits 3:0 of the immediate are used. + if ((ImmVal & 0xf) != ImmVal) + return false; + break; + } + + return true; +} + +// For EVEX instructions that can be encoded using VEX encoding +// replace them by the VEX encoding in order to reduce size. +static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) { + // VEX format. + // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1 + // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM] + // + // EVEX format. + // # of bytes: 4 1 1 1 4 / 1 1 + // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate] + const MCInstrDesc &Desc = MI.getDesc(); + + // Check for EVEX instructions only. + if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX) + return false; + + // Check for EVEX instructions with mask or broadcast as in these cases + // the EVEX prefix is needed in order to carry this information + // thus preventing the transformation to VEX encoding. + if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B)) + return false; + + // Check for EVEX instructions with L2 set. These instructions are 512-bits + // and can't be converted to VEX. + if (Desc.TSFlags & X86II::EVEX_L2) + return false; + + // Use the VEX.L bit to select the 128 or 256-bit table. + ArrayRef Table = + (Desc.TSFlags & X86II::VEX_L) ? ArrayRef(X86EvexToVex256CompressTable) + : ArrayRef(X86EvexToVex128CompressTable); + + unsigned Opc = MI.getOpcode(); + const auto *I = llvm::lower_bound(Table, Opc); + if (I == Table.end() || I->OldOpc != Opc) + return false; + + if (usesExtendedRegister(MI)) + return false; + if (!checkVEXInstPredicate(Opc, ST)) + return false; + if (!performCustomAdjustments(MI, I->NewOpc)) + return false; + + MI.setDesc(ST.getInstrInfo()->get(I->NewOpc)); + MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); + return true; +} + +bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) { +#ifndef NDEBUG + // Make sure the tables are sorted. 
+ static std::atomic TableChecked(false); + if (!TableChecked.load(std::memory_order_relaxed)) { + assert(llvm::is_sorted(X86EvexToVex128CompressTable) && + "X86EvexToVex128CompressTable is not sorted!"); + assert(llvm::is_sorted(X86EvexToVex256CompressTable) && + "X86EvexToVex256CompressTable is not sorted!"); + TableChecked.store(true, std::memory_order_relaxed); + } +#endif + const X86Subtarget &ST = MF.getSubtarget(); + if (!ST.hasAVX512()) + return false; + + bool Changed = false; + + /// Go over all basic blocks in function and replace + /// EVEX encoded instrs by VEX encoding when possible. + for (MachineBasicBlock &MBB : MF) { + // Traverse the basic block. + for (MachineInstr &MI : MBB) + Changed |= CompressEvexToVexImpl(MI, ST); + } + + return Changed; +} + +INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false) + +FunctionPass *llvm::createX86CompressEVEXPass() { + return new CompressEVEXPass(); +} diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp deleted file mode 100644 index c425c37..0000000 --- a/llvm/lib/Target/X86/X86EvexToVex.cpp +++ /dev/null @@ -1,296 +0,0 @@ -//===- X86EvexToVex.cpp ---------------------------------------------------===// -// Compress EVEX instructions to VEX encoding when possible to reduce code size -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This file defines the pass that goes over all AVX-512 instructions which -/// are encoded using the EVEX prefix and if possible replaces them by their -/// corresponding VEX encoding which is usually shorter by 2 bytes. -/// EVEX instructions may be encoded via the VEX prefix when the AVX-512 -/// instruction has a corresponding AVX/AVX2 opcode, when vector length -/// accessed by instruction is less than 512 bits and when it does not use -// the xmm or the mask registers or xmm/ymm registers with indexes higher -// than 15. -/// The pass applies code reduction on the generated code for AVX-512 instrs. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/X86BaseInfo.h" -#include "MCTargetDesc/X86InstComments.h" -#include "X86.h" -#include "X86InstrInfo.h" -#include "X86Subtarget.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/Pass.h" -#include -#include -#include - -using namespace llvm; - -// Including the generated EVEX2VEX tables. 
-struct X86EvexToVexCompressTableEntry { - uint16_t EvexOpc; - uint16_t VexOpc; - - bool operator<(const X86EvexToVexCompressTableEntry &RHS) const { - return EvexOpc < RHS.EvexOpc; - } - - friend bool operator<(const X86EvexToVexCompressTableEntry &TE, - unsigned Opc) { - return TE.EvexOpc < Opc; - } -}; -#include "X86GenEVEX2VEXTables.inc" - -#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible" -#define EVEX2VEX_NAME "x86-evex-to-vex-compress" - -#define DEBUG_TYPE EVEX2VEX_NAME - -namespace { - -class EvexToVexInstPass : public MachineFunctionPass { -public: - static char ID; - EvexToVexInstPass() : MachineFunctionPass(ID) {} - StringRef getPassName() const override { return EVEX2VEX_DESC; } - - /// Loop over all of the basic blocks, replacing EVEX instructions - /// by equivalent VEX instructions when possible for reducing code size. - bool runOnMachineFunction(MachineFunction &MF) override; - - // This pass runs after regalloc and doesn't support VReg operands. - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } -}; - -} // end anonymous namespace - -char EvexToVexInstPass::ID = 0; - -static bool usesExtendedRegister(const MachineInstr &MI) { - auto isHiRegIdx = [](unsigned Reg) { - // Check for XMM register with indexes between 16 - 31. - if (Reg >= X86::XMM16 && Reg <= X86::XMM31) - return true; - // Check for YMM register with indexes between 16 - 31. - if (Reg >= X86::YMM16 && Reg <= X86::YMM31) - return true; - // Check for GPR with indexes between 16 - 31. - if (X86II::isApxExtendedReg(Reg)) - return true; - return false; - }; - - // Check that operands are not ZMM regs or - // XMM/YMM regs with hi indexes between 16 - 31. - for (const MachineOperand &MO : MI.explicit_operands()) { - if (!MO.isReg()) - continue; - - Register Reg = MO.getReg(); - assert(!X86II::isZMMReg(Reg) && - "ZMM instructions should not be in the EVEX->VEX tables"); - if (isHiRegIdx(Reg)) - return true; - } - - return false; -} - -static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) { - switch (EvexOpc) { - default: - return true; - case X86::VCVTNEPS2BF16Z128rm: - case X86::VCVTNEPS2BF16Z128rr: - case X86::VCVTNEPS2BF16Z256rm: - case X86::VCVTNEPS2BF16Z256rr: - return ST.hasAVXNECONVERT(); - case X86::VPDPBUSDSZ128m: - case X86::VPDPBUSDSZ128r: - case X86::VPDPBUSDSZ256m: - case X86::VPDPBUSDSZ256r: - case X86::VPDPBUSDZ128m: - case X86::VPDPBUSDZ128r: - case X86::VPDPBUSDZ256m: - case X86::VPDPBUSDZ256r: - case X86::VPDPWSSDSZ128m: - case X86::VPDPWSSDSZ128r: - case X86::VPDPWSSDSZ256m: - case X86::VPDPWSSDSZ256r: - case X86::VPDPWSSDZ128m: - case X86::VPDPWSSDZ128r: - case X86::VPDPWSSDZ256m: - case X86::VPDPWSSDZ256r: - return ST.hasAVXVNNI(); - case X86::VPMADD52HUQZ128m: - case X86::VPMADD52HUQZ128r: - case X86::VPMADD52HUQZ256m: - case X86::VPMADD52HUQZ256r: - case X86::VPMADD52LUQZ128m: - case X86::VPMADD52LUQZ128r: - case X86::VPMADD52LUQZ256m: - case X86::VPMADD52LUQZ256r: - return ST.hasAVXIFMA(); - } -} - -// Do any custom cleanup needed to finalize the conversion. 
-static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) { - (void)VexOpc; - unsigned Opc = MI.getOpcode(); - switch (Opc) { - case X86::VALIGNDZ128rri: - case X86::VALIGNDZ128rmi: - case X86::VALIGNQZ128rri: - case X86::VALIGNQZ128rmi: { - assert((VexOpc == X86::VPALIGNRrri || VexOpc == X86::VPALIGNRrmi) && - "Unexpected new opcode!"); - unsigned Scale = - (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4; - MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); - Imm.setImm(Imm.getImm() * Scale); - break; - } - case X86::VSHUFF32X4Z256rmi: - case X86::VSHUFF32X4Z256rri: - case X86::VSHUFF64X2Z256rmi: - case X86::VSHUFF64X2Z256rri: - case X86::VSHUFI32X4Z256rmi: - case X86::VSHUFI32X4Z256rri: - case X86::VSHUFI64X2Z256rmi: - case X86::VSHUFI64X2Z256rri: { - assert((VexOpc == X86::VPERM2F128rr || VexOpc == X86::VPERM2I128rr || - VexOpc == X86::VPERM2F128rm || VexOpc == X86::VPERM2I128rm) && - "Unexpected new opcode!"); - MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); - int64_t ImmVal = Imm.getImm(); - // Set bit 5, move bit 1 to bit 4, copy bit 0. - Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1)); - break; - } - case X86::VRNDSCALEPDZ128rri: - case X86::VRNDSCALEPDZ128rmi: - case X86::VRNDSCALEPSZ128rri: - case X86::VRNDSCALEPSZ128rmi: - case X86::VRNDSCALEPDZ256rri: - case X86::VRNDSCALEPDZ256rmi: - case X86::VRNDSCALEPSZ256rri: - case X86::VRNDSCALEPSZ256rmi: - case X86::VRNDSCALESDZr: - case X86::VRNDSCALESDZm: - case X86::VRNDSCALESSZr: - case X86::VRNDSCALESSZm: - case X86::VRNDSCALESDZr_Int: - case X86::VRNDSCALESDZm_Int: - case X86::VRNDSCALESSZr_Int: - case X86::VRNDSCALESSZm_Int: - const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1); - int64_t ImmVal = Imm.getImm(); - // Ensure that only bits 3:0 of the immediate are used. - if ((ImmVal & 0xf) != ImmVal) - return false; - break; - } - - return true; -} - -// For EVEX instructions that can be encoded using VEX encoding -// replace them by the VEX encoding in order to reduce size. -static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) { - // VEX format. - // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1 - // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM] - // - // EVEX format. - // # of bytes: 4 1 1 1 4 / 1 1 - // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate] - const MCInstrDesc &Desc = MI.getDesc(); - - // Check for EVEX instructions only. - if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX) - return false; - - // Check for EVEX instructions with mask or broadcast as in these cases - // the EVEX prefix is needed in order to carry this information - // thus preventing the transformation to VEX encoding. - if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B)) - return false; - - // Check for EVEX instructions with L2 set. These instructions are 512-bits - // and can't be converted to VEX. - if (Desc.TSFlags & X86II::EVEX_L2) - return false; - - // Use the VEX.L bit to select the 128 or 256-bit table. - ArrayRef Table = - (Desc.TSFlags & X86II::VEX_L) ? 
ArrayRef(X86EvexToVex256CompressTable) - : ArrayRef(X86EvexToVex128CompressTable); - - unsigned EvexOpc = MI.getOpcode(); - const auto *I = llvm::lower_bound(Table, EvexOpc); - if (I == Table.end() || I->EvexOpc != EvexOpc) - return false; - - if (usesExtendedRegister(MI)) - return false; - if (!checkVEXInstPredicate(EvexOpc, ST)) - return false; - if (!performCustomAdjustments(MI, I->VexOpc)) - return false; - - MI.setDesc(ST.getInstrInfo()->get(I->VexOpc)); - MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); - return true; -} - -bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { -#ifndef NDEBUG - // Make sure the tables are sorted. - static std::atomic TableChecked(false); - if (!TableChecked.load(std::memory_order_relaxed)) { - assert(llvm::is_sorted(X86EvexToVex128CompressTable) && - "X86EvexToVex128CompressTable is not sorted!"); - assert(llvm::is_sorted(X86EvexToVex256CompressTable) && - "X86EvexToVex256CompressTable is not sorted!"); - TableChecked.store(true, std::memory_order_relaxed); - } -#endif - const X86Subtarget &ST = MF.getSubtarget(); - if (!ST.hasAVX512()) - return false; - - bool Changed = false; - - /// Go over all basic blocks in function and replace - /// EVEX encoded instrs by VEX encoding when possible. - for (MachineBasicBlock &MBB : MF) { - // Traverse the basic block. - for (MachineInstr &MI : MBB) - Changed |= CompressEvexToVexImpl(MI, ST); - } - - return Changed; -} - -INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) - -FunctionPass *llvm::createX86EvexToVexInsts() { - return new EvexToVexInstPass(); -} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 5668b51..b92bffb 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -75,7 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeGlobalISel(PR); initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); - initializeEvexToVexInstPassPass(PR); + initializeCompressEVEXPassPass(PR); initializeFixupLEAPassPass(PR); initializeFPSPass(PR); initializeX86FixupSetCCPassPass(PR); @@ -575,7 +575,7 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupInstTuning()); addPass(createX86FixupVectorConstants()); } - addPass(createX86EvexToVexInsts()); + addPass(createX86CompressEVEXPass()); addPass(createX86DiscriminateMemOpsPass()); addPass(createX86InsertPrefetchPass()); addPass(createX86InsertX87waitPass()); diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 402645e..11025b0 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -68,7 +68,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter -; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possibl +; CHECK-NEXT: Compressing EVEX instrs when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir index 06d3c15..548cf24 100644 --- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir +++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=x86_64-- -run-pass x86-evex-to-vex-compress -verify-machineinstrs -mcpu=skx -o - %s | FileCheck %s +# RUN: llc -mtriple=x86_64-- -run-pass 
x86-compress-evex -verify-machineinstrs -mcpu=skx -o - %s | FileCheck %s # This test verifies VEX encoding for AVX-512 instructions that use registers of low indexes and # do not use zmm or mask registers and have a corresponding AVX/AVX2 opcode diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index fb8d233..6f2bba8 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -205,7 +205,7 @@ ; CHECK-NEXT: X86 LEA Fixup ; CHECK-NEXT: X86 Fixup Inst Tuning ; CHECK-NEXT: X86 Fixup Vector Constants -; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible +; CHECK-NEXT: Compressing EVEX instrs when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index f765cc3..0100bf3 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -82,7 +82,7 @@ add_tablegen(llvm-tblgen LLVM Types.cpp VarLenCodeEmitterGen.cpp X86DisassemblerTables.cpp - X86EVEX2VEXTablesEmitter.cpp + X86CompressEVEXTablesEmitter.cpp X86FoldTablesEmitter.cpp X86MnemonicTables.cpp X86ModRMFilters.cpp diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp new file mode 100644 index 0000000..c1ea34d --- /dev/null +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -0,0 +1,209 @@ +//==- utils/TableGen/X86CompressEVEXTablesEmitter.cpp - X86 backend-*- C++ -*-// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This tablegen backend is responsible for emitting the X86 backend EVEX +/// compression tables. +/// +//===----------------------------------------------------------------------===// + +#include "CodeGenInstruction.h" +#include "CodeGenTarget.h" +#include "X86RecognizableInstr.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/TableGenBackend.h" + +using namespace llvm; +using namespace X86Disassembler; + +namespace { + +class X86CompressEVEXTablesEmitter { + RecordKeeper &Records; + CodeGenTarget Target; + + // Hold all pontentially compressible EVEX instructions + std::vector PreCompressionInsts; + // Hold all compressed instructions. Divided into groups with same opcodes + // to make the search more efficient + std::map> CompressedInsts; + + typedef std::pair + Entry; + + // Represent both compress tables + std::vector EVEX2VEX128; + std::vector EVEX2VEX256; + +public: + X86CompressEVEXTablesEmitter(RecordKeeper &R) : Records(R), Target(R) {} + + // run - Output X86 EVEX compression tables. + void run(raw_ostream &OS); + +private: + // Prints the given table as a C++ array of type X86CompressEVEXTableEntry + void printTable(const std::vector &Table, raw_ostream &OS); +}; + +void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, + raw_ostream &OS) { + StringRef Size = (Table == EVEX2VEX128) ? 
"128" : "256"; + + OS << "// X86 EVEX encoded instructions that have a VEX " << Size + << " encoding\n" + << "// (table format: ).\n" + << "static const X86CompressEVEXTableEntry X86EvexToVex" << Size + << "CompressTable[] = {\n" + << " // EVEX scalar with corresponding VEX.\n"; + + // Print all entries added to the table + for (const auto &Pair : Table) { + OS << " { X86::" << Pair.first->TheDef->getName() + << ", X86::" << Pair.second->TheDef->getName() << " },\n"; + } + + OS << "};\n\n"; +} + +// Return true if the 2 BitsInits are equal +// Calculates the integer value residing BitsInit object +static inline uint64_t getValueFromBitsInit(const BitsInit *B) { + uint64_t Value = 0; + for (unsigned i = 0, e = B->getNumBits(); i != e; ++i) { + if (BitInit *Bit = dyn_cast(B->getBit(i))) + Value |= uint64_t(Bit->getValue()) << i; + else + PrintFatalError("Invalid VectSize bit"); + } + return Value; +} + +// Function object - Operator() returns true if the given VEX instruction +// matches the EVEX instruction of this object. +class IsMatch { + const CodeGenInstruction *EVEXInst; + +public: + IsMatch(const CodeGenInstruction *EVEXInst) : EVEXInst(EVEXInst) {} + + bool operator()(const CodeGenInstruction *VEXInst) { + RecognizableInstrBase VEXRI(*VEXInst); + RecognizableInstrBase EVEXRI(*EVEXInst); + bool VEX_W = VEXRI.HasREX_W; + bool EVEX_W = EVEXRI.HasREX_W; + bool VEX_WIG = VEXRI.IgnoresW; + bool EVEX_WIG = EVEXRI.IgnoresW; + bool EVEX_W1_VEX_W0 = EVEXInst->TheDef->getValueAsBit("EVEX_W1_VEX_W0"); + + if (VEXRI.IsCodeGenOnly != EVEXRI.IsCodeGenOnly || + // VEX/EVEX fields + VEXRI.OpPrefix != EVEXRI.OpPrefix || VEXRI.OpMap != EVEXRI.OpMap || + VEXRI.HasVEX_4V != EVEXRI.HasVEX_4V || + VEXRI.HasVEX_L != EVEXRI.HasVEX_L || + // Match is allowed if either is VEX_WIG, or they match, or EVEX + // is VEX_W1X and VEX is VEX_W0. + (!(VEX_WIG || (!EVEX_WIG && EVEX_W == VEX_W) || + (EVEX_W1_VEX_W0 && EVEX_W && !VEX_W))) || + // Instruction's format + VEXRI.Form != EVEXRI.Form) + return false; + + // This is needed for instructions with intrinsic version (_Int). + // Where the only difference is the size of the operands. + // For example: VUCOMISDZrm and Int_VUCOMISDrm + // Also for instructions that their EVEX version was upgraded to work with + // k-registers. For example VPCMPEQBrm (xmm output register) and + // VPCMPEQBZ128rm (k register output register). + for (unsigned i = 0, e = EVEXInst->Operands.size(); i < e; i++) { + Record *OpRec1 = EVEXInst->Operands[i].Rec; + Record *OpRec2 = VEXInst->Operands[i].Rec; + + if (OpRec1 == OpRec2) + continue; + + if (isRegisterOperand(OpRec1) && isRegisterOperand(OpRec2)) { + if (getRegOperandSize(OpRec1) != getRegOperandSize(OpRec2)) + return false; + } else if (isMemoryOperand(OpRec1) && isMemoryOperand(OpRec2)) { + return false; + } else if (isImmediateOperand(OpRec1) && isImmediateOperand(OpRec2)) { + if (OpRec1->getValueAsDef("Type") != OpRec2->getValueAsDef("Type")) { + return false; + } + } else + return false; + } + + return true; + } +}; + +void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { + emitSourceFileHeader("X86 EVEX compression tables", OS); + + ArrayRef NumberedInstructions = + Target.getInstructionsByEnumValue(); + + for (const CodeGenInstruction *Inst : NumberedInstructions) { + const Record *Def = Inst->TheDef; + // Filter non-X86 instructions. 
+ if (!Def->isSubClassOf("X86Inst")) + continue; + // _REV instruction should not appear before encoding optimization + if (Def->getName().ends_with("_REV")) + continue; + RecognizableInstrBase RI(*Inst); + + // Add VEX encoded instructions to one of CompressedInsts vectors according + // to it's opcode. + if (RI.Encoding == X86Local::VEX) + CompressedInsts[RI.Opcode].push_back(Inst); + // Add relevant EVEX encoded instructions to PreCompressionInsts + else if (RI.Encoding == X86Local::EVEX && !RI.HasEVEX_K && !RI.HasEVEX_B && + !RI.HasEVEX_L2 && !Def->getValueAsBit("notEVEX2VEXConvertible")) + PreCompressionInsts.push_back(Inst); + } + + for (const CodeGenInstruction *EVEXInst : PreCompressionInsts) { + uint64_t Opcode = + getValueFromBitsInit(EVEXInst->TheDef->getValueAsBitsInit("Opcode")); + // For each EVEX instruction look for a VEX match in the appropriate vector + // (instructions with the same opcode) using function object IsMatch. + // Allow EVEX2VEXOverride to explicitly specify a match. + const CodeGenInstruction *VEXInst = nullptr; + if (!EVEXInst->TheDef->isValueUnset("EVEX2VEXOverride")) { + StringRef AltInstStr = + EVEXInst->TheDef->getValueAsString("EVEX2VEXOverride"); + Record *AltInstRec = Records.getDef(AltInstStr); + assert(AltInstRec && "EVEX2VEXOverride instruction not found!"); + VEXInst = &Target.getInstruction(AltInstRec); + } else { + auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(EVEXInst)); + if (Match != CompressedInsts[Opcode].end()) + VEXInst = *Match; + } + + if (!VEXInst) + continue; + + // In case a match is found add new entry to the appropriate table + if (EVEXInst->TheDef->getValueAsBit("hasVEX_L")) + EVEX2VEX256.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,1} + else + EVEX2VEX128.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,0} + } + + // Print both tables + printTable(EVEX2VEX128, OS); + printTable(EVEX2VEX256, OS); +} +} // namespace + +static TableGen::Emitter::OptClass + X("gen-x86-compress-evex-tables", "Generate X86 EVEX compression tables"); diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp deleted file mode 100644 index c80d9a1..0000000 --- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===- utils/TableGen/X86EVEX2VEXTablesEmitter.cpp - X86 backend-*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// This tablegen backend is responsible for emitting the X86 backend EVEX2VEX -/// compression tables. -/// -//===----------------------------------------------------------------------===// - -#include "CodeGenInstruction.h" -#include "CodeGenTarget.h" -#include "X86RecognizableInstr.h" -#include "llvm/TableGen/Error.h" -#include "llvm/TableGen/Record.h" -#include "llvm/TableGen/TableGenBackend.h" - -using namespace llvm; -using namespace X86Disassembler; - -namespace { - -class X86EVEX2VEXTablesEmitter { - RecordKeeper &Records; - CodeGenTarget Target; - - // Hold all non-masked & non-broadcasted EVEX encoded instructions - std::vector EVEXInsts; - // Hold all VEX encoded instructions. 
Divided into groups with same opcodes - // to make the search more efficient - std::map> VEXInsts; - - typedef std::pair - Entry; - - // Represent both compress tables - std::vector EVEX2VEX128; - std::vector EVEX2VEX256; - -public: - X86EVEX2VEXTablesEmitter(RecordKeeper &R) : Records(R), Target(R) {} - - // run - Output X86 EVEX2VEX tables. - void run(raw_ostream &OS); - -private: - // Prints the given table as a C++ array of type - // X86EvexToVexCompressTableEntry - void printTable(const std::vector &Table, raw_ostream &OS); -}; - -void X86EVEX2VEXTablesEmitter::printTable(const std::vector &Table, - raw_ostream &OS) { - StringRef Size = (Table == EVEX2VEX128) ? "128" : "256"; - - OS << "// X86 EVEX encoded instructions that have a VEX " << Size - << " encoding\n" - << "// (table format: ).\n" - << "static const X86EvexToVexCompressTableEntry X86EvexToVex" << Size - << "CompressTable[] = {\n" - << " // EVEX scalar with corresponding VEX.\n"; - - // Print all entries added to the table - for (const auto &Pair : Table) { - OS << " { X86::" << Pair.first->TheDef->getName() - << ", X86::" << Pair.second->TheDef->getName() << " },\n"; - } - - OS << "};\n\n"; -} - -// Return true if the 2 BitsInits are equal -// Calculates the integer value residing BitsInit object -static inline uint64_t getValueFromBitsInit(const BitsInit *B) { - uint64_t Value = 0; - for (unsigned i = 0, e = B->getNumBits(); i != e; ++i) { - if (BitInit *Bit = dyn_cast(B->getBit(i))) - Value |= uint64_t(Bit->getValue()) << i; - else - PrintFatalError("Invalid VectSize bit"); - } - return Value; -} - -// Function object - Operator() returns true if the given VEX instruction -// matches the EVEX instruction of this object. -class IsMatch { - const CodeGenInstruction *EVEXInst; - -public: - IsMatch(const CodeGenInstruction *EVEXInst) : EVEXInst(EVEXInst) {} - - bool operator()(const CodeGenInstruction *VEXInst) { - RecognizableInstrBase VEXRI(*VEXInst); - RecognizableInstrBase EVEXRI(*EVEXInst); - bool VEX_W = VEXRI.HasREX_W; - bool EVEX_W = EVEXRI.HasREX_W; - bool VEX_WIG = VEXRI.IgnoresW; - bool EVEX_WIG = EVEXRI.IgnoresW; - bool EVEX_W1_VEX_W0 = EVEXInst->TheDef->getValueAsBit("EVEX_W1_VEX_W0"); - - if (VEXRI.IsCodeGenOnly != EVEXRI.IsCodeGenOnly || - // VEX/EVEX fields - VEXRI.OpPrefix != EVEXRI.OpPrefix || VEXRI.OpMap != EVEXRI.OpMap || - VEXRI.HasVEX_4V != EVEXRI.HasVEX_4V || - VEXRI.HasVEX_L != EVEXRI.HasVEX_L || - // Match is allowed if either is VEX_WIG, or they match, or EVEX - // is VEX_W1X and VEX is VEX_W0. - (!(VEX_WIG || (!EVEX_WIG && EVEX_W == VEX_W) || - (EVEX_W1_VEX_W0 && EVEX_W && !VEX_W))) || - // Instruction's format - VEXRI.Form != EVEXRI.Form) - return false; - - // This is needed for instructions with intrinsic version (_Int). - // Where the only difference is the size of the operands. - // For example: VUCOMISDZrm and Int_VUCOMISDrm - // Also for instructions that their EVEX version was upgraded to work with - // k-registers. For example VPCMPEQBrm (xmm output register) and - // VPCMPEQBZ128rm (k register output register). 
- for (unsigned i = 0, e = EVEXInst->Operands.size(); i < e; i++) { - Record *OpRec1 = EVEXInst->Operands[i].Rec; - Record *OpRec2 = VEXInst->Operands[i].Rec; - - if (OpRec1 == OpRec2) - continue; - - if (isRegisterOperand(OpRec1) && isRegisterOperand(OpRec2)) { - if (getRegOperandSize(OpRec1) != getRegOperandSize(OpRec2)) - return false; - } else if (isMemoryOperand(OpRec1) && isMemoryOperand(OpRec2)) { - return false; - } else if (isImmediateOperand(OpRec1) && isImmediateOperand(OpRec2)) { - if (OpRec1->getValueAsDef("Type") != OpRec2->getValueAsDef("Type")) { - return false; - } - } else - return false; - } - - return true; - } -}; - -void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) { - emitSourceFileHeader("X86 EVEX2VEX tables", OS); - - ArrayRef NumberedInstructions = - Target.getInstructionsByEnumValue(); - - for (const CodeGenInstruction *Inst : NumberedInstructions) { - const Record *Def = Inst->TheDef; - // Filter non-X86 instructions. - if (!Def->isSubClassOf("X86Inst")) - continue; - // _REV instruction should not appear before encoding optimization - if (Def->getName().ends_with("_REV")) - continue; - RecognizableInstrBase RI(*Inst); - - // Add VEX encoded instructions to one of VEXInsts vectors according to - // it's opcode. - if (RI.Encoding == X86Local::VEX) - VEXInsts[RI.Opcode].push_back(Inst); - // Add relevant EVEX encoded instructions to EVEXInsts - else if (RI.Encoding == X86Local::EVEX && !RI.HasEVEX_K && !RI.HasEVEX_B && - !RI.HasEVEX_L2 && !Def->getValueAsBit("notEVEX2VEXConvertible")) - EVEXInsts.push_back(Inst); - } - - for (const CodeGenInstruction *EVEXInst : EVEXInsts) { - uint64_t Opcode = getValueFromBitsInit(EVEXInst->TheDef-> - getValueAsBitsInit("Opcode")); - // For each EVEX instruction look for a VEX match in the appropriate vector - // (instructions with the same opcode) using function object IsMatch. - // Allow EVEX2VEXOverride to explicitly specify a match. 
- const CodeGenInstruction *VEXInst = nullptr; - if (!EVEXInst->TheDef->isValueUnset("EVEX2VEXOverride")) { - StringRef AltInstStr = - EVEXInst->TheDef->getValueAsString("EVEX2VEXOverride"); - Record *AltInstRec = Records.getDef(AltInstStr); - assert(AltInstRec && "EVEX2VEXOverride instruction not found!"); - VEXInst = &Target.getInstruction(AltInstRec); - } else { - auto Match = llvm::find_if(VEXInsts[Opcode], IsMatch(EVEXInst)); - if (Match != VEXInsts[Opcode].end()) - VEXInst = *Match; - } - - if (!VEXInst) - continue; - - // In case a match is found add new entry to the appropriate table - if (EVEXInst->TheDef->getValueAsBit("hasVEX_L")) - EVEX2VEX256.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,1} - else - EVEX2VEX128.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,0} - } - - // Print both tables - printTable(EVEX2VEX128, OS); - printTable(EVEX2VEX256, OS); -} -} // namespace - -static TableGen::Emitter::OptClass - X("gen-x86-EVEX2VEX-tables", "Generate X86 EVEX to VEX compress tables"); -- cgit v1.1 From 241e4c7466b877265e1645ca4709fe666c95c6c4 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sat, 6 Jan 2024 13:39:22 +0900 Subject: [Bazel] Fixup for #77008 (`orc::SymbolMap`) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index f035a17..6d16230 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2719,6 +2719,7 @@ cc_library( ]), hdrs = glob([ "include/llvm/ExecutionEngine/JITLink/*.h", + "include/llvm/ExecutionEngine/Orc/*.h", ]), copts = llvm_copts, deps = [ -- cgit v1.1 From 16094cb629159ee0896e2ca1facc15118b229665 Mon Sep 17 00:00:00 2001 From: hev Date: Sat, 6 Jan 2024 13:36:09 +0800 Subject: [llvm][LoongArch] Support per-global code model attribute for LoongArch (#72079) This patch gets the code model from global variable attribute if it has, otherwise the target's will be used. 
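For reference, the attribute is attached to individual globals in IR. A minimal sketch, mirroring the test added by this patch (the names @g and load_g are illustrative, not part of the change):

    @g = external dso_local global i32, code_model "large", align 4

    define i32 @load_g() {
      %v = load i32, ptr @g, align 4
      ret i32 %v
    }

With the attribute present, the address of @g is lowered using the large code model sequence even though the module-level code model stays at the target default.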
--------- Signed-off-by: WANG Rui --- .../lib/Target/LoongArch/LoongArchISelLowering.cpp | 22 ++++++++--- llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 3 +- .../LoongArch/global-variable-code-model.ll | 44 ++++++++++++++++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/global-variable-code-model.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 2cfb2c1..3e75b9f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -762,12 +762,13 @@ static SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty, template SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, + CodeModel::Model M, bool IsLocal) const { SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); - switch (DAG.getTarget().getCodeModel()) { + switch (M) { default: report_fatal_error("Unsupported code model"); @@ -808,24 +809,35 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, SDValue LoongArchTargetLowering::lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - return getAddr(cast(Op), DAG); + return getAddr(cast(Op), DAG, + DAG.getTarget().getCodeModel()); } SDValue LoongArchTargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const { - return getAddr(cast(Op), DAG); + return getAddr(cast(Op), DAG, + DAG.getTarget().getCodeModel()); } SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op, SelectionDAG &DAG) const { - return getAddr(cast(Op), DAG); + return getAddr(cast(Op), DAG, + DAG.getTarget().getCodeModel()); } SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *N = cast(Op); assert(N->getOffset() == 0 && "unexpected offset in global node"); - return getAddr(N, DAG, N->getGlobal()->isDSOLocal()); + auto CM = DAG.getTarget().getCodeModel(); + const GlobalValue *GV = N->getGlobal(); + + if (GV->isDSOLocal() && isa(GV)) { + if (auto GCM = dyn_cast(GV)->getCodeModel()) + CM = *GCM; + } + + return getAddr(N, DAG, CM, GV->isDSOLocal()); } SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 2875aa8..7218262 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -254,7 +254,8 @@ private: LoongArchCCAssignFn Fn) const; template - SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, CodeModel::Model M, + bool IsLocal = true) const; SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, unsigned Opc, bool Large = false) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll new file mode 100644 index 0000000..aa47808 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s + +@a= external dso_local global i32, code_model "small", align 4 + +define dso_local signext i32 @local_small() #0 { +; CHECK-LABEL: local_small: +; CHECK: # %bb.0: +; CHECK-NEXT: 
pcalau12i $a0, %pc_hi20(a) +; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(a) +; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ret + %1 = load i32, ptr @a, align 4 + ret i32 %1 +} + +@b= external dso_local global i32, code_model "large", align 4 + +define dso_local signext i32 @local_large() #0 { +; CHECK-LABEL: local_large: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(b) +; CHECK-NEXT: addi.d $t8, $zero, %pc_lo12(b) +; CHECK-NEXT: lu32i.d $t8, %pc64_lo20(b) +; CHECK-NEXT: lu52i.d $t8, $t8, %pc64_hi12(b) +; CHECK-NEXT: add.d $a0, $t8, $a0 +; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ret + %1 = load i32, ptr @b, align 4 + ret i32 %1 +} + +@c= external global i32, code_model "large", align 4 + +define dso_local signext i32 @non_local_large() #0 { +; CHECK-LABEL: non_local_large: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(c) +; CHECK-NEXT: ld.d $a0, $a0, %got_pc_lo12(c) +; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ret + %1 = load i32, ptr @c, align 4 + ret i32 %1 +} -- cgit v1.1 From 602c8fa2d8da6562e4f36df3bd63c26a4c7461e7 Mon Sep 17 00:00:00 2001 From: Craig Hesling Date: Sat, 6 Jan 2024 01:08:59 -0500 Subject: [GitHub] Fix slow sccache install on macOS by upgrading macOS version (#77165) The "Setup ccache" step on macOS-11 builds takes between 15 to 20 mins, whereas this step takes a less than 10 seconds on other runners. The bulk of this time is spent at the "Install sccache" step, where brew emits warnings like "Warning: You are using macOS 11." and "We (and Apple) do not provide support for this old version...". Bumping the version of macOS greatly decreases this cache setup time to about 20 seconds. Furthermore, it seems like it is speeding up general build times, too. It appears that https://github.com/actions/virtual-environments/issues/5900 has been resolved or obsoleted, so I do not believe we need to lock macOS to 11 anymore. --- .github/workflows/llvm-project-tests.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml index 02b1ab7..3345e73 100644 --- a/.github/workflows/llvm-project-tests.yml +++ b/.github/workflows/llvm-project-tests.yml @@ -14,7 +14,7 @@ on: required: false os_list: required: false - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-12"]' workflow_call: inputs: build_target: @@ -34,9 +34,7 @@ on: type: string # Use windows-2019 due to: # https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317 - # We're using a specific version of macOS due to: - # https://github.com/actions/virtual-environments/issues/5900 - default: '["ubuntu-latest", "windows-2019", "macOS-11"]' + default: '["ubuntu-latest", "windows-2019", "macOS-12"]' concurrency: # Skip intermediate builds: always. @@ -91,10 +89,6 @@ jobs: variant: sccache - name: Build and Test uses: llvm/actions/build-test-llvm-project@main - env: - # Workaround for https://github.com/actions/virtual-environments/issues/5900. 
- # This should be a no-op for non-mac OSes - PKG_CONFIG_PATH: /usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig//12 with: cmake_args: '-GNinja -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLDB_INCLUDE_TESTS=OFF -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache ${{ inputs.extra_cmake_args }}' build_target: '${{ inputs.build_target }}' -- cgit v1.1 From cf02e6e71064ba2ce36c354e3bd6b2d57de29d85 Mon Sep 17 00:00:00 2001 From: Craig Hesling Date: Sat, 6 Jan 2024 01:15:10 -0500 Subject: [GitHub] Remove redundant cache key prefix (#76914) Remove the redundant sccache cache key prefix. This prefix is already added by the ccache action, which results in cache keys like "sccache-sccache-ubuntu-...". See the following source lines as proof: https://github.com/hendrikmuhs/ccache-action/blob/2a51777f6f64b7b7bea213601acba8f5f4fdbe03/src/restore.ts#L22-L23 --- .github/workflows/llvm-project-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml index 3345e73..594831e 100644 --- a/.github/workflows/llvm-project-tests.yml +++ b/.github/workflows/llvm-project-tests.yml @@ -85,7 +85,7 @@ jobs: # enough cache space for all the tests to run at once and still # fit under the 10 GB limit. max-size: 500M - key: sccache-${{ matrix.os }} + key: ${{ matrix.os }} variant: sccache - name: Build and Test uses: llvm/actions/build-test-llvm-project@main -- cgit v1.1 From 376baeb2d535826eb2d8158c4147e37cda493f35 Mon Sep 17 00:00:00 2001 From: Craig Hesling Date: Sat, 6 Jan 2024 01:22:07 -0500 Subject: [GitHub] Add basic CI for libclang Python binding unit tests (#76784) This is important to aid development of Python type annotations in the libclang binding. See https://github.com/llvm/llvm-project/issues/76664 for more details. * Run on all pull requests and direct pushes. * This makes use of the existing llvm-project-tests.yml recipe, which will preload ccache from previous runs. * Building libclang currently takes about 9mins when ccache is warm and about an 1hr 20mins if it is cold using the standard GitHub ubuntu runner. * In the future, this could be broken into the following discrete steps for clarity: 1. Build libclang dependency. ninja -C build libclang 2. Run Python unit tests. ninja -C build check-clang-python * Followup changes will bring testing on older python versions and static type checking. Issue https://github.com/llvm/llvm-project/issues/76601. --- .github/workflows/libclang-python-tests.yml | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/libclang-python-tests.yml diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml new file mode 100644 index 0000000..73edb6c --- /dev/null +++ b/.github/workflows/libclang-python-tests.yml @@ -0,0 +1,39 @@ +name: Libclang Python Binding Tests + +permissions: + contents: read + +on: + workflow_dispatch: + push: + paths: + - 'clang/bindings/python/**' + - 'clang/tools/libclang/**' + - 'clang/CMakeList.txt' + - '.github/workflows/libclang-python-tests.yml' + - '.github/workflows/llvm-project-tests.yml' + pull_request: + paths: + - 'clang/bindings/python/**' + - 'clang/tools/libclang/**' + - 'clang/CMakeList.txt' + - '.github/workflows/libclang-python-tests.yml' + - '.github/workflows/llvm-project-tests.yml' + +concurrency: + # Skip intermediate builds: always. 
+ # Cancel intermediate builds: only if it is a pull request build. + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + +jobs: + check-clang-python: + # Build libclang and then run the libclang Python binding's unit tests. + name: Build and run Python unit tests + uses: ./.github/workflows/llvm-project-tests.yml + with: + build_target: check-clang-python + projects: clang + # There is an issue running on "windows-2019". + # See https://github.com/llvm/llvm-project/issues/76601#issuecomment-1873049082. + os_list: '["ubuntu-latest"]' -- cgit v1.1 From 80dbf601d1815ff90b5aee18f426da964920dbe7 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sat, 6 Jan 2024 15:29:25 +0800 Subject: [X86][NFC] Remove EVEX2VEXOverride/NotEVEX2VEXConvertible Remove these two classes and put all the entries in X86 EVEX compression tables that need special handling in .def file. PR #77065 tries to add entries that need special handling for APX in .def file. Compared to setting fields in td files, that method looks cleaner. This patch is to unify the addition of manual entries. --- llvm/lib/Target/X86/X86InstrAVX512.td | 230 ++++++++------------- llvm/lib/Target/X86/X86InstrFormats.td | 4 - llvm/lib/Target/X86/X86InstrUtils.td | 4 - .../TableGen/X86CompressEVEXTablesEmitter.cpp | 53 +++-- .../utils/TableGen/X86ManualCompressEVEXTables.def | 88 ++++++++ 5 files changed, 210 insertions(+), 169 deletions(-) create mode 100644 llvm/utils/TableGen/X86ManualCompressEVEXTables.def diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index c3a673f..6664671 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3185,15 +3185,13 @@ defm : operation_subvector_mask_lowering; multiclass avx512_load opc, string OpcodeStr, string Name, X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, - X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, - bit NoRMPattern = 0, + X86SchedWriteMoveLS Sched, bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let hasSideEffects = 0 in { let isMoveReg = 1 in def rr : AVX512PI, EVEX, Sched<[Sched.RR]>, - EVEX2VEXOverride; + _.ExeDomain>, EVEX, Sched<[Sched.RR]>; def rrkz : AVX512PI opc, string OpcodeStr, string Name, !if(NoRMPattern, [], [(set _.RC:$dst, (_.VT (ld_frag addr:$src)))]), - _.ExeDomain>, EVEX, Sched<[Sched.RM]>, - EVEX2VEXOverride; + _.ExeDomain>, EVEX, Sched<[Sched.RM]>; let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { def rrk : AVX512PI opc, string OpcodeStr, string Name, multiclass avx512_alignedload_vl opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoRMPattern = 0> { + bit NoRMPattern = 0> { let Predicates = [prd] in defm Z : avx512_load, EVEX_V512; + Sched.ZMM, NoRMPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load, EVEX_V256; + Sched.YMM, NoRMPattern>, EVEX_V256; defm Z128 : avx512_load, EVEX_V128; + Sched.XMM, NoRMPattern>, EVEX_V128; } } multiclass avx512_load_vl opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoRMPattern = 0, + bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in defm Z : avx512_load, EVEX_V512; + masked_load, Sched.ZMM, NoRMPattern, SelectOprr>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load, EVEX_V256; + masked_load, Sched.YMM, NoRMPattern, 
SelectOprr>, EVEX_V256; defm Z128 : avx512_load, EVEX_V128; + masked_load, Sched.XMM, NoRMPattern, SelectOprr>, EVEX_V128; } } multiclass avx512_store opc, string OpcodeStr, string BaseName, X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, - X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, - bit NoMRPattern = 0> { + X86SchedWriteMoveLS Sched, bit NoMRPattern = 0> { let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { let isMoveReg = 1 in def rr_REV : AVX512PI, EVEX, - Sched<[Sched.RR]>, - EVEX2VEXOverride; + Sched<[Sched.RR]>; def rrk_REV : AVX512PI opc, string OpcodeStr, string BaseName, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), !if(NoMRPattern, [], [(st_frag (_.VT _.RC:$src), addr:$dst)]), - _.ExeDomain>, EVEX, Sched<[Sched.MR]>, - EVEX2VEXOverride; + _.ExeDomain>, EVEX, Sched<[Sched.MR]>; def mrk : AVX512PI opc, string OpcodeStr, string BaseName, multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoMRPattern = 0> { + bit NoMRPattern = 0> { let Predicates = [prd] in defm Z : avx512_store, EVEX_V512; + masked_store, Sched.ZMM, NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store, EVEX_V256; + masked_store, Sched.YMM, NoMRPattern>, EVEX_V256; defm Z128 : avx512_store, EVEX_V128; + masked_store, Sched.XMM, NoMRPattern>, EVEX_V128; } } multiclass avx512_alignedstore_vl opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoMRPattern = 0> { + bit NoMRPattern = 0> { let Predicates = [prd] in defm Z : avx512_store, EVEX_V512; + masked_store_aligned, Sched.ZMM, NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store, EVEX_V256; + masked_store_aligned, Sched.YMM, NoMRPattern>, EVEX_V256; defm Z128 : avx512_store, EVEX_V128; + masked_store_aligned, Sched.XMM, NoMRPattern>, EVEX_V128; } } defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, + HasAVX512, SchedWriteFMoveLS>, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, + HasAVX512, SchedWriteFMoveLS>, TB, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, + HasAVX512, SchedWriteFMoveLS>, avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, + HasAVX512, SchedWriteFMoveLS>, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>, + SchedWriteFMoveLS, 0, null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPS">, + SchedWriteFMoveLS>, TB, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>, + SchedWriteFMoveLS, 0, null_frag>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPD">, + SchedWriteFMoveLS>, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA", 1>, + HasAVX512, SchedWriteVecMoveLS, 1>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA", 1>, + HasAVX512, SchedWriteVecMoveLS, 1>, TB, PD, 
EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA">, + HasAVX512, SchedWriteVecMoveLS>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA">, + HasAVX512, SchedWriteVecMoveLS>, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, TB, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, TB, XD, REX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>, + SchedWriteVecMoveLS, 1, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, TB, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>, + SchedWriteVecMoveLS, 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU">, + SchedWriteVecMoveLS>, TB, XS, REX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need @@ -4844,8 +4825,7 @@ defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, SchedWriteVecIMul, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, - SchedWriteVecIMul, HasDQI, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecIMul, HasDQI, 1>, T8; defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul, HasBWI, 1>; defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, @@ -4989,8 +4969,7 @@ defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SchedWriteVecALU, HasBWI, 1>; @@ -4999,8 +4978,7 @@ defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, SchedWriteVecALU, HasBWI, 1>, T8; @@ -5009,8 +4987,7 @@ defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SchedWriteVecALU, HasBWI, 1>; @@ -5019,8 +4996,7 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, 
"vpminuw", umin, defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; // PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. let Predicates = [HasDQI, NoVLX] in { @@ -5405,8 +5381,7 @@ multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo } multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, SDNode SaeNode, - X86FoldableSchedWrite sched, bit IsCommutable, - string EVEX2VexOvrd> { + X86FoldableSchedWrite sched, bit IsCommutable> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar opc, string OpcodeStr,X86VectorVTInfo _, (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]>, - EVEX2VEXOverride { + Sched<[sched]> { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5436,8 +5410,7 @@ multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Uses = [MXCSR] in @@ -5474,19 +5447,15 @@ multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae, + VecNode, SaeNode, sched.PS.Scl, IsCommutable>, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae, + VecNode, SaeNode, sched.PD.Scl, IsCommutable>, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; let Predicates = [HasFP16] in { defm SHZ : avx512_fp_scalar_sae, - T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, - NotEVEX2VEXConvertible; + VecNode, SaeNode, sched.PH.Scl, IsCommutable>, + T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; } } defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds, @@ -5506,14 +5475,13 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, - X86FoldableSchedWrite sched, - string EVEX2VEXOvrd> { + X86FoldableSchedWrite sched> { let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]>, EVEX2VEXOverride { + Sched<[sched]> { let isCommutable = 1; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5521,36 +5489,34 @@ multiclass avx512_comutable_binop_s opc, string OpcodeStr, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride; + Sched<[sched.Folded, sched.ReadAfterFold]>; } } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSS">, TB, XS, + SchedWriteFCmp.Scl>, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMINCSDZ : 
avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSD">, TB, XD, + SchedWriteFCmp.Scl>, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSS">, TB, XS, + SchedWriteFCmp.Scl>, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSD">, TB, XD, + SchedWriteFCmp.Scl>, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5, XS, - EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, - NotEVEX2VEXConvertible; + SchedWriteFCmp.Scl>, T_MAP5, XS, + EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC; + defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5, XS, - EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, - NotEVEX2VEXConvertible; + SchedWriteFCmp.Scl>, T_MAP5, XS, + EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC; multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, @@ -5820,8 +5786,7 @@ multiclass avx512_fp_scalef_all opc, bits<8> opcScaler, string OpcodeStr EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PD; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", - SchedWriteFAdd>, NotEVEX2VEXConvertible; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", SchedWriteFAdd>; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -5985,11 +5950,9 @@ multiclass avx512_shift_sizes opc, string OpcodeStr, SDNode OpNode, multiclass avx512_shift_types opcd, bits<8> opcq, bits<8> opcw, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched, - bit NotEVEX2VEXConvertibleQ = 0> { + X86SchedWriteWidths sched> { defm D : avx512_shift_sizes; - let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in defm Q : avx512_shift_sizes, REX_W; defm W : avx512_shift_sizes opcw, Format ImmFormR, Format ImmFormM, multiclass avx512_shift_rmi_dq opcd, bits<8> opcq, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched, - bit NotEVEX2VEXConvertibleQ = 0> { + X86SchedWriteWidths sched> { defm D: avx512_shift_rmi_sizes, EVEX_CD8<32, CD8VF>; - let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in defm Q: avx512_shift_rmi_sizes, EVEX_CD8<64, CD8VF>, REX_W; } @@ -6054,7 +6015,7 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, - SchedWriteVecShiftImm, 1>, + SchedWriteVecShiftImm>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; @@ -6066,7 +6027,7 @@ defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SchedWriteVecShift>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, - SchedWriteVecShift, 1>; + SchedWriteVecShift>; defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SchedWriteVecShift>; @@ -8443,9 +8404,9 @@ multiclass avx512_cvtqq2pd opc, string OpcodeStr, SDPatternOperator OpNo } let Predicates = [HasDQI, HasVLX] in { defm Z128 : 
avx512_vcvt_fp, EVEX_V128, NotEVEX2VEXConvertible; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256, NotEVEX2VEXConvertible; + MaskOpNode, sched.YMM>, EVEX_V256; } } @@ -8524,11 +8485,10 @@ multiclass avx512_cvtqq2ps_dq2ph opc, string OpcodeStr, SDPatternOperato defm Z128 : avx512_vcvt_fp, - EVEX_V128, NotEVEX2VEXConvertible; + EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256, - NotEVEX2VEXConvertible; + "{y}">, EVEX_V256; // Special patterns to allow use of X86VM[SU]intToFP for masking. Instruction // patterns have been disabled with null_frag. @@ -10882,8 +10842,7 @@ defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info, multiclass avx512_shuff_packed_128_common opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, - X86VectorVTInfo CastInfo, - string EVEX2VEXOvrd> { + X86VectorVTInfo CastInfo> { let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable opc, string OpcodeStr, (_.VT (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))))>, - Sched<[sched]>, EVEX2VEXOverride; + Sched<[sched]>; defm rmi : AVX512_maskable opc, string OpcodeStr, (CastInfo.VT (X86Shuf128 _.RC:$src1, (CastInfo.LdFrag addr:$src2), (i8 timm:$src3)))))>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride; + Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable opc, string OpcodeStr, multiclass avx512_shuff_packed_128 opc, - string EVEX2VEXOvrd>{ + AVX512VLVectorVTInfo CastInfo, bits<8> opc>{ let Predicates = [HasAVX512] in defm Z : avx512_shuff_packed_128_common, EVEX_V512; + _.info512, CastInfo.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in defm Z256 : avx512_shuff_packed_128_common, EVEX_V256; + _.info256, CastInfo.info256>, EVEX_V256; } defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256, - avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; + avx512vl_f32_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256, - avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; + avx512vl_f64_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256, - avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; + avx512vl_i32_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256, - avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; + avx512vl_i64_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; multiclass avx512_valign opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the - // instantiation of this class. let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable, - Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; + Sched<[sched]>; defm rmi : AVX512_maskable, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride<"VPALIGNRrmi">; + Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable, AVX512AIi8Base, EVEX, VVVV, EVEX_V128; // We can't really override the 256-bit version so change it back to unset. 
- let EVEX2VEXOverride = ? in defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>, AVX512AIi8Base, EVEX, VVVV, EVEX_V256; } @@ -11111,7 +11063,7 @@ let Predicates = [HasVLX, HasBWI] in { defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw", SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>, - EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible; + EVEX_CD8<8, CD8VF>; multiclass avx512_unary_rm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -13088,12 +13040,10 @@ multiclass avx512_cvtqq2ph opc, string OpcodeStr, SDPatternOperator OpNo let Predicates = [HasFP16, HasVLX] in { defm Z128 : avx512_vcvt_fp, - EVEX_V128, NotEVEX2VEXConvertible; + i128mem, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp, - EVEX_V256, NotEVEX2VEXConvertible; + i256mem, VK4WM>, EVEX_V256; } def : InstAlias opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); - // Used to prevent an explicit EVEX2VEX override for this instruction. - string EVEX2VEXOverride = ?; - - bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion. ExplicitOpPrefix explicitOpPrefix = NoExplicitOpPrefix; bits<2> explicitOpPrefixBits = explicitOpPrefix.Value; // TSFlags layout should be kept in sync with X86BaseInfo.h. diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 132941a..9183bcd 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -66,9 +66,6 @@ class EVEX_CD8 { } class NoCD8 { bits<7> CD8_Scale = 0; } -class EVEX2VEXOverride { - string EVEX2VEXOverride = VEXInstrName; -} class AVX512BIi8Base : TB, PD { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; @@ -89,7 +86,6 @@ class AVX512PDIi8Base : TB, PD { Domain ExeDomain = SSEPackedDouble; ImmType ImmT = Imm8; } -class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; } class ExplicitREX2Prefix { ExplicitOpPrefix explicitOpPrefix = ExplicitREX2; } class ExplicitVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitVEX; } class ExplicitEVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitEVEX; } diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index c1ea34d..3a26732 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -17,12 +17,23 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" +#include +#include using namespace llvm; using namespace X86Disassembler; namespace { +const std::map ManualMap = { +#define ENTRY(OLD, NEW) {#OLD, #NEW}, +#include "X86ManualCompressEVEXTables.def" +}; +const std::set NoCompressSet = { +#define NOCOMP(INSN) #INSN, +#include "X86ManualCompressEVEXTables.def" +}; + class X86CompressEVEXTablesEmitter { RecordKeeper &Records; CodeGenTarget Target; @@ -151,13 +162,14 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { Target.getInstructionsByEnumValue(); for (const CodeGenInstruction *Inst : NumberedInstructions) { - const Record *Def = Inst->TheDef; - // Filter non-X86 instructions. 
- if (!Def->isSubClassOf("X86Inst")) - continue; + const Record *Rec = Inst->TheDef; // _REV instruction should not appear before encoding optimization - if (Def->getName().ends_with("_REV")) + if (!Rec->isSubClassOf("X86Inst") || Rec->getName().ends_with("_REV")) continue; + + if (NoCompressSet.find(Rec->getName()) != NoCompressSet.end()) + continue; + RecognizableInstrBase RI(*Inst); // Add VEX encoded instructions to one of CompressedInsts vectors according @@ -166,25 +178,24 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { CompressedInsts[RI.Opcode].push_back(Inst); // Add relevant EVEX encoded instructions to PreCompressionInsts else if (RI.Encoding == X86Local::EVEX && !RI.HasEVEX_K && !RI.HasEVEX_B && - !RI.HasEVEX_L2 && !Def->getValueAsBit("notEVEX2VEXConvertible")) + !RI.HasEVEX_L2) PreCompressionInsts.push_back(Inst); } - for (const CodeGenInstruction *EVEXInst : PreCompressionInsts) { + for (const CodeGenInstruction *Inst : PreCompressionInsts) { + const Record *Rec = Inst->TheDef; uint64_t Opcode = - getValueFromBitsInit(EVEXInst->TheDef->getValueAsBitsInit("Opcode")); - // For each EVEX instruction look for a VEX match in the appropriate vector - // (instructions with the same opcode) using function object IsMatch. - // Allow EVEX2VEXOverride to explicitly specify a match. + getValueFromBitsInit(Inst->TheDef->getValueAsBitsInit("Opcode")); const CodeGenInstruction *VEXInst = nullptr; - if (!EVEXInst->TheDef->isValueUnset("EVEX2VEXOverride")) { - StringRef AltInstStr = - EVEXInst->TheDef->getValueAsString("EVEX2VEXOverride"); - Record *AltInstRec = Records.getDef(AltInstStr); - assert(AltInstRec && "EVEX2VEXOverride instruction not found!"); - VEXInst = &Target.getInstruction(AltInstRec); + if (ManualMap.find(Rec->getName()) != ManualMap.end()) { + Record *NewRec = Records.getDef(ManualMap.at(Rec->getName())); + assert(NewRec && "Instruction not found!"); + VEXInst = &Target.getInstruction(NewRec); } else { - auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(EVEXInst)); + // For each EVEX instruction look for a VEX match in the appropriate + // vector (instructions with the same opcode) using function object + // IsMatch. + auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(Inst)); if (Match != CompressedInsts[Opcode].end()) VEXInst = *Match; } @@ -193,10 +204,10 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { continue; // In case a match is found add new entry to the appropriate table - if (EVEXInst->TheDef->getValueAsBit("hasVEX_L")) - EVEX2VEX256.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,1} + if (Rec->getValueAsBit("hasVEX_L")) + EVEX2VEX256.push_back(std::make_pair(Inst, VEXInst)); // {0,1} else - EVEX2VEX128.push_back(std::make_pair(EVEXInst, VEXInst)); // {0,0} + EVEX2VEX128.push_back(std::make_pair(Inst, VEXInst)); // {0,0} } // Print both tables diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def new file mode 100644 index 0000000..0da32f9 --- /dev/null +++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def @@ -0,0 +1,88 @@ +//===- X86ManualCompressEVEXTables.def ---------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// This file defines all the entries in X86 EVEX compression tables that need +// special handling. +//===----------------------------------------------------------------------===// + +#ifndef NOCOMP +#define NOCOMP(INSN) +#endif +NOCOMP(VCVTQQ2PDZ128rr) +NOCOMP(VCVTQQ2PSZ128rm) +NOCOMP(VCVTQQ2PSZ128rr) +NOCOMP(VDBPSADBWZ128rmi) +NOCOMP(VDBPSADBWZ128rri) +NOCOMP(VPMAXSQZ128rm) +NOCOMP(VPMAXSQZ128rr) +NOCOMP(VPMAXUQZ128rm) +NOCOMP(VPMAXUQZ128rr) +NOCOMP(VPMINSQZ128rm) +NOCOMP(VPMINSQZ128rr) +NOCOMP(VPMINUQZ128rm) +NOCOMP(VPMINUQZ128rr) +NOCOMP(VPMULLQZ128rm) +NOCOMP(VPMULLQZ128rr) +NOCOMP(VPSRAQZ128ri) +NOCOMP(VPSRAQZ128rm) +NOCOMP(VPSRAQZ128rr) +NOCOMP(VSCALEFPSZ128rm) +NOCOMP(VDBPSADBWZ256rmi) +NOCOMP(VDBPSADBWZ256rri) +NOCOMP(VPMAXSQZ256rm) +NOCOMP(VPMAXSQZ256rr) +NOCOMP(VPMAXUQZ256rm) +NOCOMP(VPMAXUQZ256rr) +NOCOMP(VPMINSQZ256rm) +NOCOMP(VPMINSQZ256rr) +NOCOMP(VPMINUQZ256rm) +NOCOMP(VPMINUQZ256rr) +NOCOMP(VPMULLQZ256rm) +NOCOMP(VPMULLQZ256rr) +NOCOMP(VPSRAQZ256ri) +NOCOMP(VPSRAQZ256rm) +NOCOMP(VPSRAQZ256rr) +NOCOMP(VSCALEFPSZ256rm) +#undef NOCOMP + +#ifndef ENTRY +#define ENTRY(OLD, NEW) +#endif +ENTRY(VALIGNDZ128rmi, VPALIGNRrmi) +ENTRY(VALIGNDZ128rri, VPALIGNRrri) +ENTRY(VALIGNQZ128rmi, VPALIGNRrmi) +ENTRY(VALIGNQZ128rri, VPALIGNRrri) +ENTRY(VMAXSDZrm, VMAXSDrm) +ENTRY(VMAXSDZrr, VMAXSDrr) +ENTRY(VMAXSSZrm, VMAXSSrm) +ENTRY(VMAXSSZrr, VMAXSSrr) +ENTRY(VMINSDZrm, VMINSDrm) +ENTRY(VMINSDZrr, VMINSDrr) +ENTRY(VMINSSZrm, VMINSSrm) +ENTRY(VMINSSZrr, VMINSSrr) +ENTRY(VMOVDQU16Z128mr, VMOVDQUmr) +ENTRY(VMOVDQU16Z128rm, VMOVDQUrm) +ENTRY(VMOVDQU16Z128rr, VMOVDQUrr) +ENTRY(VMOVDQU8Z128mr, VMOVDQUmr) +ENTRY(VMOVDQU8Z128rm, VMOVDQUrm) +ENTRY(VMOVDQU8Z128rr, VMOVDQUrr) +ENTRY(VMOVDQU16Z256mr, VMOVDQUYmr) +ENTRY(VMOVDQU16Z256rm, VMOVDQUYrm) +ENTRY(VMOVDQU16Z256rr, VMOVDQUYrr) +ENTRY(VMOVDQU8Z256mr, VMOVDQUYmr) +ENTRY(VMOVDQU8Z256rm, VMOVDQUYrm) +ENTRY(VMOVDQU8Z256rr, VMOVDQUYrr) +ENTRY(VSHUFF32X4Z256rmi, VPERM2F128rm) +ENTRY(VSHUFF32X4Z256rri, VPERM2F128rr) +ENTRY(VSHUFF64X2Z256rmi, VPERM2F128rm) +ENTRY(VSHUFF64X2Z256rri, VPERM2F128rr) +ENTRY(VSHUFI32X4Z256rmi, VPERM2I128rm) +ENTRY(VSHUFI32X4Z256rri, VPERM2I128rr) +ENTRY(VSHUFI64X2Z256rmi, VPERM2I128rm) +ENTRY(VSHUFI64X2Z256rri, VPERM2I128rr) +#undef ENTRY -- cgit v1.1 From 04a7ec610ee1ad869e402c327984cb649be86f3c Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sat, 6 Jan 2024 17:07:39 +0800 Subject: [X86][NFC] Remove VEX_W1X after 80dbf60 --- llvm/lib/Target/X86/X86InstrAVX512.td | 18 +- llvm/lib/Target/X86/X86InstrFormats.td | 2 - llvm/lib/Target/X86/X86InstrUtils.td | 2 - .../TableGen/X86CompressEVEXTablesEmitter.cpp | 45 ++-- llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 6 +- .../utils/TableGen/X86ManualCompressEVEXTables.def | 243 +++++++++++++++++++++ 6 files changed, 271 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 6664671..fe7d90f 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -448,7 +448,7 @@ multiclass vinsert_for_type, X86VectorVTInfo< 4, EltVT64, VR256X>, null_frag, vinsert128_insert, sched>, - VEX_W1X, EVEX_V256; + EVEX_V256, REX_W; // Even with DQI we'd like to only use these instructions for masking. 
let Predicates = [HasDQI] in { @@ -750,7 +750,7 @@ multiclass vextract_for_type, X86VectorVTInfo< 2, EltVT64, VR128X>, null_frag, vextract128_extract, SchedRR, SchedMR>, - VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; + EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasDQI] in { @@ -1161,7 +1161,7 @@ multiclass avx512_fp_broadcast_ss opc, string OpcodeStr, defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss", avx512vl_f32_info>; defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd", - avx512vl_f64_info>, VEX_W1X; + avx512vl_f64_info>, REX_W; multiclass avx512_int_broadcast_reg opc, SchedWrite SchedRR, X86VectorVTInfo _, SDPatternOperator OpNode, @@ -1267,7 +1267,7 @@ defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", avx512vl_i32_info, HasAVX512, 1>; defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", - avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; + avx512vl_i64_info, HasAVX512, 1>, REX_W; multiclass avx512_subvec_broadcast_rm opc, string OpcodeStr, SDPatternOperator OpNode, @@ -1460,11 +1460,11 @@ let Predicates = [HasBF16, HasVLX] in let Predicates = [HasVLX, HasDQI] in { defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X, - EVEX_V256, EVEX_CD8<64, CD8VT2>; + X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, + EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X, - EVEX_V256, EVEX_CD8<64, CD8VT2>; + X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, + EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK4WM:$mask, @@ -6396,7 +6396,7 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, avx512vl_i32_info>; let ExeDomain = SSEPackedDouble in defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, - avx512vl_i64_info>, VEX_W1X; + avx512vl_i64_info>, REX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index 9f0b732..8798b13 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -247,8 +247,6 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bit hasREPPrefix = 0; // Does this inst have a REP prefix? bits<2> OpEncBits = OpEnc.Value; bit IgnoresW = 0; // Does this inst ignore REX_W field? - bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX - // instruction with VEX.W == 0. bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 9183bcd..f4ae158 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -43,8 +43,6 @@ class XOP { Encoding OpEnc = EncXOP; } class VEX { Encoding OpEnc = EncVEX; } class EVEX { Encoding OpEnc = EncEVEX; } class WIG { bit IgnoresW = 1; } -// Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX. 
-class VEX_W1X { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class VVVV { bit hasVEX_4V = 1; } diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index 3a26732..a45e87a 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -95,34 +95,23 @@ static inline uint64_t getValueFromBitsInit(const BitsInit *B) { return Value; } -// Function object - Operator() returns true if the given VEX instruction -// matches the EVEX instruction of this object. class IsMatch { - const CodeGenInstruction *EVEXInst; + const CodeGenInstruction *OldInst; public: - IsMatch(const CodeGenInstruction *EVEXInst) : EVEXInst(EVEXInst) {} - - bool operator()(const CodeGenInstruction *VEXInst) { - RecognizableInstrBase VEXRI(*VEXInst); - RecognizableInstrBase EVEXRI(*EVEXInst); - bool VEX_W = VEXRI.HasREX_W; - bool EVEX_W = EVEXRI.HasREX_W; - bool VEX_WIG = VEXRI.IgnoresW; - bool EVEX_WIG = EVEXRI.IgnoresW; - bool EVEX_W1_VEX_W0 = EVEXInst->TheDef->getValueAsBit("EVEX_W1_VEX_W0"); - - if (VEXRI.IsCodeGenOnly != EVEXRI.IsCodeGenOnly || - // VEX/EVEX fields - VEXRI.OpPrefix != EVEXRI.OpPrefix || VEXRI.OpMap != EVEXRI.OpMap || - VEXRI.HasVEX_4V != EVEXRI.HasVEX_4V || - VEXRI.HasVEX_L != EVEXRI.HasVEX_L || - // Match is allowed if either is VEX_WIG, or they match, or EVEX - // is VEX_W1X and VEX is VEX_W0. - (!(VEX_WIG || (!EVEX_WIG && EVEX_W == VEX_W) || - (EVEX_W1_VEX_W0 && EVEX_W && !VEX_W))) || - // Instruction's format - VEXRI.Form != EVEXRI.Form) + IsMatch(const CodeGenInstruction *OldInst) : OldInst(OldInst) {} + + bool operator()(const CodeGenInstruction *NewInst) { + RecognizableInstrBase NewRI(*NewInst); + RecognizableInstrBase OldRI(*OldInst); + + // Return false if any of the following fields of does not match. + if (std::make_tuple(OldRI.IsCodeGenOnly, OldRI.OpMap, NewRI.OpPrefix, + OldRI.HasVEX_4V, OldRI.HasVEX_L, OldRI.HasREX_W, + OldRI.Form) != + std::make_tuple(NewRI.IsCodeGenOnly, NewRI.OpMap, OldRI.OpPrefix, + NewRI.HasVEX_4V, NewRI.HasVEX_L, NewRI.HasREX_W, + NewRI.Form)) return false; // This is needed for instructions with intrinsic version (_Int). @@ -131,9 +120,9 @@ public: // Also for instructions that their EVEX version was upgraded to work with // k-registers. For example VPCMPEQBrm (xmm output register) and // VPCMPEQBZ128rm (k register output register). 
- for (unsigned i = 0, e = EVEXInst->Operands.size(); i < e; i++) { - Record *OpRec1 = EVEXInst->Operands[i].Rec; - Record *OpRec2 = VEXInst->Operands[i].Rec; + for (unsigned i = 0, e = OldInst->Operands.size(); i < e; i++) { + Record *OpRec1 = OldInst->Operands[i].Rec; + Record *OpRec2 = NewInst->Operands[i].Rec; if (OpRec1 == OpRec2) continue; diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 101b75e..8a860d0 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -374,8 +374,7 @@ public: RegRI.HasEVEX_L2, RegRI.HasEVEX_NF, RegRec->getValueAsBit("hasEVEX_RC"), RegRec->getValueAsBit("hasLockPrefix"), - RegRec->getValueAsBit("hasNoTrackPrefix"), - RegRec->getValueAsBit("EVEX_W1_VEX_W0")) != + RegRec->getValueAsBit("hasNoTrackPrefix")) != std::make_tuple(MemRI.Encoding, MemRI.Opcode, MemRI.OpPrefix, MemRI.OpMap, MemRI.OpSize, MemRI.AdSize, MemRI.HasREX_W, MemRI.HasVEX_4V, MemRI.HasVEX_L, MemRI.IgnoresVEX_L, @@ -383,8 +382,7 @@ public: MemRI.HasEVEX_L2, MemRI.HasEVEX_NF, MemRec->getValueAsBit("hasEVEX_RC"), MemRec->getValueAsBit("hasLockPrefix"), - MemRec->getValueAsBit("hasNoTrackPrefix"), - MemRec->getValueAsBit("EVEX_W1_VEX_W0"))) + MemRec->getValueAsBit("hasNoTrackPrefix"))) return false; // Make sure the sizes of the operands of both instructions suit each other. diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def index 0da32f9..58ca10e 100644 --- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def +++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def @@ -85,4 +85,247 @@ ENTRY(VSHUFI32X4Z256rmi, VPERM2I128rm) ENTRY(VSHUFI32X4Z256rri, VPERM2I128rr) ENTRY(VSHUFI64X2Z256rmi, VPERM2I128rm) ENTRY(VSHUFI64X2Z256rri, VPERM2I128rr) +// W bit does not match +ENTRY(VADDPDZ128rm, VADDPDrm) +ENTRY(VADDPDZ128rr, VADDPDrr) +ENTRY(VADDSDZrm, VADDSDrm) +ENTRY(VADDSDZrm_Int, VADDSDrm_Int) +ENTRY(VADDSDZrr, VADDSDrr) +ENTRY(VADDSDZrr_Int, VADDSDrr_Int) +ENTRY(VANDNPDZ128rm, VANDNPDrm) +ENTRY(VANDNPDZ128rr, VANDNPDrr) +ENTRY(VANDPDZ128rm, VANDPDrm) +ENTRY(VANDPDZ128rr, VANDPDrr) +ENTRY(VCOMISDZrm, VCOMISDrm) +ENTRY(VCOMISDZrm_Int, VCOMISDrm_Int) +ENTRY(VCOMISDZrr, VCOMISDrr) +ENTRY(VCOMISDZrr_Int, VCOMISDrr_Int) +ENTRY(VCVTPD2DQZ128rm, VCVTPD2DQrm) +ENTRY(VCVTPD2DQZ128rr, VCVTPD2DQrr) +ENTRY(VCVTPD2PSZ128rm, VCVTPD2PSrm) +ENTRY(VCVTPD2PSZ128rr, VCVTPD2PSrr) +ENTRY(VCVTSD2SSZrm, VCVTSD2SSrm) +ENTRY(VCVTSD2SSZrm_Int, VCVTSD2SSrm_Int) +ENTRY(VCVTSD2SSZrr, VCVTSD2SSrr) +ENTRY(VCVTSD2SSZrr_Int, VCVTSD2SSrr_Int) +ENTRY(VCVTTPD2DQZ128rm, VCVTTPD2DQrm) +ENTRY(VCVTTPD2DQZ128rr, VCVTTPD2DQrr) +ENTRY(VDIVPDZ128rm, VDIVPDrm) +ENTRY(VDIVPDZ128rr, VDIVPDrr) +ENTRY(VDIVSDZrm, VDIVSDrm) +ENTRY(VDIVSDZrm_Int, VDIVSDrm_Int) +ENTRY(VDIVSDZrr, VDIVSDrr) +ENTRY(VDIVSDZrr_Int, VDIVSDrr_Int) +ENTRY(VMAXCPDZ128rm, VMAXCPDrm) +ENTRY(VMAXCPDZ128rr, VMAXCPDrr) +ENTRY(VMAXCSDZrm, VMAXCSDrm) +ENTRY(VMAXCSDZrr, VMAXCSDrr) +ENTRY(VMAXPDZ128rm, VMAXPDrm) +ENTRY(VMAXPDZ128rr, VMAXPDrr) +ENTRY(VMAXSDZrm_Int, VMAXSDrm_Int) +ENTRY(VMAXSDZrr_Int, VMAXSDrr_Int) +ENTRY(VMINCPDZ128rm, VMINCPDrm) +ENTRY(VMINCPDZ128rr, VMINCPDrr) +ENTRY(VMINCSDZrm, VMINCSDrm) +ENTRY(VMINCSDZrr, VMINCSDrr) +ENTRY(VMINPDZ128rm, VMINPDrm) +ENTRY(VMINPDZ128rr, VMINPDrr) +ENTRY(VMINSDZrm_Int, VMINSDrm_Int) +ENTRY(VMINSDZrr_Int, VMINSDrr_Int) +ENTRY(VMOVAPDZ128mr, VMOVAPDmr) +ENTRY(VMOVAPDZ128rm, VMOVAPDrm) +ENTRY(VMOVAPDZ128rr, VMOVAPDrr) +ENTRY(VMOVDDUPZ128rm, VMOVDDUPrm) 
+ENTRY(VMOVDDUPZ128rr, VMOVDDUPrr) +ENTRY(VMOVDQA64Z128mr, VMOVDQAmr) +ENTRY(VMOVDQA64Z128rm, VMOVDQArm) +ENTRY(VMOVDQA64Z128rr, VMOVDQArr) +ENTRY(VMOVDQU64Z128mr, VMOVDQUmr) +ENTRY(VMOVDQU64Z128rm, VMOVDQUrm) +ENTRY(VMOVDQU64Z128rr, VMOVDQUrr) +ENTRY(VMOVHPDZ128mr, VMOVHPDmr) +ENTRY(VMOVHPDZ128rm, VMOVHPDrm) +ENTRY(VMOVLPDZ128mr, VMOVLPDmr) +ENTRY(VMOVLPDZ128rm, VMOVLPDrm) +ENTRY(VMOVNTPDZ128mr, VMOVNTPDmr) +ENTRY(VMOVPQI2QIZmr, VMOVPQI2QImr) +ENTRY(VMOVPQI2QIZrr, VMOVPQI2QIrr) +ENTRY(VMOVQI2PQIZrm, VMOVQI2PQIrm) +ENTRY(VMOVSDZmr, VMOVSDmr) +ENTRY(VMOVSDZrm, VMOVSDrm) +ENTRY(VMOVSDZrm_alt, VMOVSDrm_alt) +ENTRY(VMOVSDZrr, VMOVSDrr) +ENTRY(VMOVUPDZ128mr, VMOVUPDmr) +ENTRY(VMOVUPDZ128rm, VMOVUPDrm) +ENTRY(VMOVUPDZ128rr, VMOVUPDrr) +ENTRY(VMOVZPQILo2PQIZrr, VMOVZPQILo2PQIrr) +ENTRY(VMULPDZ128rm, VMULPDrm) +ENTRY(VMULPDZ128rr, VMULPDrr) +ENTRY(VMULSDZrm, VMULSDrm) +ENTRY(VMULSDZrm_Int, VMULSDrm_Int) +ENTRY(VMULSDZrr, VMULSDrr) +ENTRY(VMULSDZrr_Int, VMULSDrr_Int) +ENTRY(VORPDZ128rm, VORPDrm) +ENTRY(VORPDZ128rr, VORPDrr) +ENTRY(VPADDQZ128rm, VPADDQrm) +ENTRY(VPADDQZ128rr, VPADDQrr) +ENTRY(VPANDNQZ128rm, VPANDNrm) +ENTRY(VPANDNQZ128rr, VPANDNrr) +ENTRY(VPANDQZ128rm, VPANDrm) +ENTRY(VPANDQZ128rr, VPANDrr) +ENTRY(VPERMILPDZ128mi, VPERMILPDmi) +ENTRY(VPERMILPDZ128ri, VPERMILPDri) +ENTRY(VPERMILPDZ128rm, VPERMILPDrm) +ENTRY(VPERMILPDZ128rr, VPERMILPDrr) +ENTRY(VPMULDQZ128rm, VPMULDQrm) +ENTRY(VPMULDQZ128rr, VPMULDQrr) +ENTRY(VPMULUDQZ128rm, VPMULUDQrm) +ENTRY(VPMULUDQZ128rr, VPMULUDQrr) +ENTRY(VPORQZ128rm, VPORrm) +ENTRY(VPORQZ128rr, VPORrr) +ENTRY(VPSLLQZ128ri, VPSLLQri) +ENTRY(VPSLLQZ128rm, VPSLLQrm) +ENTRY(VPSLLQZ128rr, VPSLLQrr) +ENTRY(VPSRLQZ128ri, VPSRLQri) +ENTRY(VPSRLQZ128rm, VPSRLQrm) +ENTRY(VPSRLQZ128rr, VPSRLQrr) +ENTRY(VPSUBQZ128rm, VPSUBQrm) +ENTRY(VPSUBQZ128rr, VPSUBQrr) +ENTRY(VPUNPCKHQDQZ128rm, VPUNPCKHQDQrm) +ENTRY(VPUNPCKHQDQZ128rr, VPUNPCKHQDQrr) +ENTRY(VPUNPCKLQDQZ128rm, VPUNPCKLQDQrm) +ENTRY(VPUNPCKLQDQZ128rr, VPUNPCKLQDQrr) +ENTRY(VPXORQZ128rm, VPXORrm) +ENTRY(VPXORQZ128rr, VPXORrr) +ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDm) +ENTRY(VRNDSCALEPDZ128rri, VROUNDPDr) +ENTRY(VRNDSCALESDZm, VROUNDSDm) +ENTRY(VRNDSCALESDZm_Int, VROUNDSDm_Int) +ENTRY(VRNDSCALESDZr, VROUNDSDr) +ENTRY(VRNDSCALESDZr_Int, VROUNDSDr_Int) +ENTRY(VSHUFPDZ128rmi, VSHUFPDrmi) +ENTRY(VSHUFPDZ128rri, VSHUFPDrri) +ENTRY(VSQRTPDZ128m, VSQRTPDm) +ENTRY(VSQRTPDZ128r, VSQRTPDr) +ENTRY(VSQRTSDZm, VSQRTSDm) +ENTRY(VSQRTSDZm_Int, VSQRTSDm_Int) +ENTRY(VSQRTSDZr, VSQRTSDr) +ENTRY(VSQRTSDZr_Int, VSQRTSDr_Int) +ENTRY(VSUBPDZ128rm, VSUBPDrm) +ENTRY(VSUBPDZ128rr, VSUBPDrr) +ENTRY(VSUBSDZrm, VSUBSDrm) +ENTRY(VSUBSDZrm_Int, VSUBSDrm_Int) +ENTRY(VSUBSDZrr, VSUBSDrr) +ENTRY(VSUBSDZrr_Int, VSUBSDrr_Int) +ENTRY(VUCOMISDZrm, VUCOMISDrm) +ENTRY(VUCOMISDZrm_Int, VUCOMISDrm_Int) +ENTRY(VUCOMISDZrr, VUCOMISDrr) +ENTRY(VUCOMISDZrr_Int, VUCOMISDrr_Int) +ENTRY(VUNPCKHPDZ128rm, VUNPCKHPDrm) +ENTRY(VUNPCKHPDZ128rr, VUNPCKHPDrr) +ENTRY(VUNPCKLPDZ128rm, VUNPCKLPDrm) +ENTRY(VUNPCKLPDZ128rr, VUNPCKLPDrr) +ENTRY(VXORPDZ128rm, VXORPDrm) +ENTRY(VXORPDZ128rr, VXORPDrr) +ENTRY(VADDPDZ256rm, VADDPDYrm) +ENTRY(VADDPDZ256rr, VADDPDYrr) +ENTRY(VANDNPDZ256rm, VANDNPDYrm) +ENTRY(VANDNPDZ256rr, VANDNPDYrr) +ENTRY(VANDPDZ256rm, VANDPDYrm) +ENTRY(VANDPDZ256rr, VANDPDYrr) +ENTRY(VCVTPD2DQZ256rm, VCVTPD2DQYrm) +ENTRY(VCVTPD2DQZ256rr, VCVTPD2DQYrr) +ENTRY(VCVTPD2PSZ256rm, VCVTPD2PSYrm) +ENTRY(VCVTPD2PSZ256rr, VCVTPD2PSYrr) +ENTRY(VCVTTPD2DQZ256rm, VCVTTPD2DQYrm) +ENTRY(VCVTTPD2DQZ256rr, VCVTTPD2DQYrr) +ENTRY(VDIVPDZ256rm, VDIVPDYrm) +ENTRY(VDIVPDZ256rr, VDIVPDYrr) 
+ENTRY(VEXTRACTF64x2Z256mr, VEXTRACTF128mr) +ENTRY(VEXTRACTF64x2Z256rr, VEXTRACTF128rr) +ENTRY(VEXTRACTI64x2Z256mr, VEXTRACTI128mr) +ENTRY(VEXTRACTI64x2Z256rr, VEXTRACTI128rr) +ENTRY(VINSERTF64x2Z256rm, VINSERTF128rm) +ENTRY(VINSERTF64x2Z256rr, VINSERTF128rr) +ENTRY(VINSERTI64x2Z256rm, VINSERTI128rm) +ENTRY(VINSERTI64x2Z256rr, VINSERTI128rr) +ENTRY(VMAXCPDZ256rm, VMAXCPDYrm) +ENTRY(VMAXCPDZ256rr, VMAXCPDYrr) +ENTRY(VMAXPDZ256rm, VMAXPDYrm) +ENTRY(VMAXPDZ256rr, VMAXPDYrr) +ENTRY(VMINCPDZ256rm, VMINCPDYrm) +ENTRY(VMINCPDZ256rr, VMINCPDYrr) +ENTRY(VMINPDZ256rm, VMINPDYrm) +ENTRY(VMINPDZ256rr, VMINPDYrr) +ENTRY(VMOVAPDZ256mr, VMOVAPDYmr) +ENTRY(VMOVAPDZ256rm, VMOVAPDYrm) +ENTRY(VMOVAPDZ256rr, VMOVAPDYrr) +ENTRY(VMOVDDUPZ256rm, VMOVDDUPYrm) +ENTRY(VMOVDDUPZ256rr, VMOVDDUPYrr) +ENTRY(VMOVDQA64Z256mr, VMOVDQAYmr) +ENTRY(VMOVDQA64Z256rm, VMOVDQAYrm) +ENTRY(VMOVDQA64Z256rr, VMOVDQAYrr) +ENTRY(VMOVDQU64Z256mr, VMOVDQUYmr) +ENTRY(VMOVDQU64Z256rm, VMOVDQUYrm) +ENTRY(VMOVDQU64Z256rr, VMOVDQUYrr) +ENTRY(VMOVNTPDZ256mr, VMOVNTPDYmr) +ENTRY(VMOVUPDZ256mr, VMOVUPDYmr) +ENTRY(VMOVUPDZ256rm, VMOVUPDYrm) +ENTRY(VMOVUPDZ256rr, VMOVUPDYrr) +ENTRY(VMULPDZ256rm, VMULPDYrm) +ENTRY(VMULPDZ256rr, VMULPDYrr) +ENTRY(VORPDZ256rm, VORPDYrm) +ENTRY(VORPDZ256rr, VORPDYrr) +ENTRY(VPADDQZ256rm, VPADDQYrm) +ENTRY(VPADDQZ256rr, VPADDQYrr) +ENTRY(VPANDNQZ256rm, VPANDNYrm) +ENTRY(VPANDNQZ256rr, VPANDNYrr) +ENTRY(VPANDQZ256rm, VPANDYrm) +ENTRY(VPANDQZ256rr, VPANDYrr) +ENTRY(VPERMILPDZ256mi, VPERMILPDYmi) +ENTRY(VPERMILPDZ256ri, VPERMILPDYri) +ENTRY(VPERMILPDZ256rm, VPERMILPDYrm) +ENTRY(VPERMILPDZ256rr, VPERMILPDYrr) +ENTRY(VPMULDQZ256rm, VPMULDQYrm) +ENTRY(VPMULDQZ256rr, VPMULDQYrr) +ENTRY(VPMULUDQZ256rm, VPMULUDQYrm) +ENTRY(VPMULUDQZ256rr, VPMULUDQYrr) +ENTRY(VPORQZ256rm, VPORYrm) +ENTRY(VPORQZ256rr, VPORYrr) +ENTRY(VPSLLQZ256ri, VPSLLQYri) +ENTRY(VPSLLQZ256rm, VPSLLQYrm) +ENTRY(VPSLLQZ256rr, VPSLLQYrr) +ENTRY(VPSRLQZ256ri, VPSRLQYri) +ENTRY(VPSRLQZ256rm, VPSRLQYrm) +ENTRY(VPSRLQZ256rr, VPSRLQYrr) +ENTRY(VPSUBQZ256rm, VPSUBQYrm) +ENTRY(VPSUBQZ256rr, VPSUBQYrr) +ENTRY(VPUNPCKHQDQZ256rm, VPUNPCKHQDQYrm) +ENTRY(VPUNPCKHQDQZ256rr, VPUNPCKHQDQYrr) +ENTRY(VPUNPCKLQDQZ256rm, VPUNPCKLQDQYrm) +ENTRY(VPUNPCKLQDQZ256rr, VPUNPCKLQDQYrr) +ENTRY(VPXORQZ256rm, VPXORYrm) +ENTRY(VPXORQZ256rr, VPXORYrr) +ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYm) +ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYr) +ENTRY(VSHUFPDZ256rmi, VSHUFPDYrmi) +ENTRY(VSHUFPDZ256rri, VSHUFPDYrri) +ENTRY(VSQRTPDZ256m, VSQRTPDYm) +ENTRY(VSQRTPDZ256r, VSQRTPDYr) +ENTRY(VSUBPDZ256rm, VSUBPDYrm) +ENTRY(VSUBPDZ256rr, VSUBPDYrr) +ENTRY(VUNPCKHPDZ256rm, VUNPCKHPDYrm) +ENTRY(VUNPCKHPDZ256rr, VUNPCKHPDYrr) +ENTRY(VUNPCKLPDZ256rm, VUNPCKLPDYrm) +ENTRY(VUNPCKLPDZ256rr, VUNPCKLPDYrr) +ENTRY(VXORPDZ256rm, VXORPDYrm) +ENTRY(VXORPDZ256rr, VXORPDYrr) +ENTRY(VPBROADCASTQZ128rm, VPBROADCASTQrm) +ENTRY(VPBROADCASTQZ128rr, VPBROADCASTQrr) +ENTRY(VBROADCASTF64X2Z128rm, VBROADCASTF128rm) +ENTRY(VBROADCASTI64X2Z128rm, VBROADCASTI128rm) +ENTRY(VBROADCASTSDZ256rm, VBROADCASTSDYrm) +ENTRY(VBROADCASTSDZ256rr, VBROADCASTSDYrr) +ENTRY(VPBROADCASTQZ256rm, VPBROADCASTQYrm) +ENTRY(VPBROADCASTQZ256rr, VPBROADCASTQYrr) #undef ENTRY -- cgit v1.1 From b2246cf73e10c38aefffd923e4b53a1975f45909 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Sat, 6 Jan 2024 10:18:27 +0100 Subject: Bazel port for a5902a4d2425ac083f1530719e35b5c562cb1e60 --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel 
b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 6d16230..1110daa 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2187,7 +2187,7 @@ llvm_target_lib_list = [lib for lib in [ ("-gen-callingconv", "lib/Target/X86/X86GenCallingConv.inc"), ("-gen-subtarget", "lib/Target/X86/X86GenSubtargetInfo.inc"), ("-gen-x86-fold-tables -asmwriternum=1", "lib/Target/X86/X86GenFoldTables.inc"), - ("-gen-x86-EVEX2VEX-tables", "lib/Target/X86/X86GenEVEX2VEXTables.inc"), + ("-gen-x86-compress-evex-tables", "lib/Target/X86/X86GenCompressEVEXTables.inc"), ("-gen-exegesis", "lib/Target/X86/X86GenExegesis.inc"), ("-gen-x86-mnemonic-tables -asmwriternum=1", "lib/Target/X86/X86GenMnemonicTables.inc"), ], -- cgit v1.1 From ab073cbccb6e79d8b65a286e8948bc1f07c7c09b Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Sat, 6 Jan 2024 01:34:20 -0800 Subject: Add requires darwin to verify-no-file.yaml (#77188) --- llvm/test/tools/llvm-dwarfdump/AArch64/verify-no-file.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/tools/llvm-dwarfdump/AArch64/verify-no-file.yaml b/llvm/test/tools/llvm-dwarfdump/AArch64/verify-no-file.yaml index 1327cc2..808cc7b 100644 --- a/llvm/test/tools/llvm-dwarfdump/AArch64/verify-no-file.yaml +++ b/llvm/test/tools/llvm-dwarfdump/AArch64/verify-no-file.yaml @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t.o # RUN: llvm-dwarfdump -arch arm64 --debug-line --verify %t.o 2>&1 | FileCheck %s +# REQUIRES: system-darwin # CHECK-NOT: error: .debug_line[0x{{[0-9a-f]+}}][0] has invalid file index 1 (valid values are [1,0]): --- !mach-o -- cgit v1.1 From 5b33cff39753c790ecc6847435664592abe40415 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Sat, 6 Jan 2024 11:17:01 +0100 Subject: [mlir][gpu] Add Support for Cluster of Thread Blocks in `gpu.launch` (#76924) --- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 53 +++++++++++-- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 88 +++++++++++++++++++--- .../lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 20 +++-- .../Conversion/SCFToGPU/no_blocks_no_threads.mlir | 4 +- mlir/test/Dialect/GPU/invalid.mlir | 2 +- mlir/test/Dialect/GPU/outlining.mlir | 74 ++++++++++++++++++ 6 files changed, 219 insertions(+), 22 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index efef61b..8d4a110 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -678,6 +678,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [ Arguments<(ins Variadic:$asyncDependencies, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, + Optional:$clusterSizeX, + Optional:$clusterSizeY, + Optional:$clusterSizeZ, Optional:$dynamicSharedMemorySize)>, Results<(outs Optional:$asyncToken)> { let summary = "GPU kernel launch operation"; @@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [ to the amount of dynamic shared memory a kernel's workgroup should be allocated; when this operand is not present, a zero size is assumed. 
- The body region has at least _twelve_ arguments, grouped as follows: + The body region has at least _twelve_ arguments, or _eighteen_ if cluster + dimensions are present, grouped as follows: + - three optional arguments that contain cluster identifiers along x,y,z + dimensions; - three arguments that contain block identifiers along x,y,z dimensions; - three arguments that contain thread identifiers along x,y,z dimensions; - operands of the `gpu.launch` operation as is (i.e. the operands for @@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [ ``` operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )? + ( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )? `blocks` `(` ssa-id-list `)` `in` ssa-reassignment `threads` `(` ssa-id-list `)` `in` ssa-reassignment (dynamic_shared_memory_size ssa-use)? @@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [ // Assuming %val1 is defined outside the gpu.launch region. %42 = load %workgroup[%bx] : memref<32xf32, 3> } + + // Launch with clusters. + gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2) + blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5) + threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8) + { + // Cluster, block and thread identifiers, as well as cluster/block/grid + // sizes are immediately usable inside body region. + "some_op"(%cx, %bx, %tx) : (index, index, index) -> () + } ``` Rationale: using operation/block arguments gives analyses a clear way of @@ -784,7 +801,10 @@ def GPU_LaunchOp : GPU_Op<"launch", [ CArg<"Type", "nullptr">:$asyncTokenType, CArg<"ValueRange", "{}">:$asyncDependencies, CArg<"TypeRange", "{}">:$workgroupAttributions, - CArg<"TypeRange", "{}">:$privateAttributions)> + CArg<"TypeRange", "{}">:$privateAttributions, + CArg<"Value", "nullptr">:$clusterSizeX, + CArg<"Value", "nullptr">:$clusterSizeY, + CArg<"Value", "nullptr">:$clusterSizeZ)> ]; let extraClassDeclaration = [{ @@ -792,17 +812,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [ KernelDim3 getBlockIds(); /// Get the SSA values corresponding to kernel thread identifiers. KernelDim3 getThreadIds(); + /// Get the SSA values corresponding to kernel cluster identifiers. + std::optional getClusterIds(); /// Get the SSA values corresponding to kernel grid size. KernelDim3 getGridSize(); /// Get the SSA values corresponding to kernel block size. KernelDim3 getBlockSize(); + /// Get the SSA values corresponding to kernel cluster size. + std::optional getClusterSize(); /// Get the SSA values passed as operands to specify the grid size. KernelDim3 getGridSizeOperandValues(); /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); + /// Get the SSA values passed as operands to specify the cluster size. + std::optional getClusterSizeOperandValues(); static StringRef getBlocksKeyword() { return "blocks"; } + static StringRef getClustersKeyword() { return "clusters"; } static StringRef getThreadsKeyword() { return "threads"; } static StringRef getDynamicSharedMemorySizeKeyword() { return "dynamic_shared_memory_size"; @@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [ /// placed in the leading positions of the argument list. static constexpr unsigned kNumConfigRegionAttributes = 12; + /// Returns true if cluster size is specified. 
+ bool hasClusterSize() { + if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ()) + return true; + return false; + } + /// Returns the number of operands including cluster size + unsigned getNumConfigOperands() { + return kNumConfigOperands + (hasClusterSize() ? 3 : 0); + } + /// Returns the number of region attributes including cluster size + unsigned getNumConfigRegionAttributes() { + return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0); + } + /// Returns the keywords used in the custom syntax for this Op. static StringRef getWorkgroupKeyword() { return "workgroup"; } static StringRef getPrivateKeyword() { return "private"; } @@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [ /// the workgroup memory ArrayRef getWorkgroupAttributions() { auto begin = - std::next(getBody().args_begin(), kNumConfigRegionAttributes); + std::next(getBody().args_begin(), getNumConfigRegionAttributes()); auto end = std::next(begin, getNumWorkgroupAttributions()); return {begin, end}; } @@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [ /// Returns the number of buffers located in the private memory. unsigned getNumPrivateAttributions() { - return getBody().getNumArguments() - kNumConfigRegionAttributes - + return getBody().getNumArguments() - getNumConfigRegionAttributes() - getNumWorkgroupAttributions(); } @@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [ // memory. auto begin = std::next(getBody().args_begin(), - kNumConfigRegionAttributes + getNumWorkgroupAttributions()); + getNumConfigRegionAttributes() + getNumWorkgroupAttributions()); return {begin, getBody().args_end()}; } @@ -871,6 +913,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [ let hasCanonicalizer = 1; let hasCustomAssemblyFormat = 1; let hasRegionVerifier = 1; + let hasVerifier = 1; } def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>, diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index dd482f3..0209009 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, Value getBlockSizeZ, Value dynamicSharedMemorySize, Type asyncTokenType, ValueRange asyncDependencies, TypeRange workgroupAttributions, - TypeRange privateAttributions) { + TypeRange privateAttributions, Value clusterSizeX, + Value clusterSizeY, Value clusterSizeZ) { // Add a WorkGroup attribution attribute. This attribute is required to // identify private attributions in the list of block argguments. result.addAttribute(getNumWorkgroupAttributionsAttrName(), @@ -660,6 +661,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, // Add grid and block sizes as op operands, followed by the data operands. result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX, getBlockSizeY, getBlockSizeZ}); + if (clusterSizeX) + result.addOperands(clusterSizeX); + if (clusterSizeY) + result.addOperands(clusterSizeY); + if (clusterSizeZ) + result.addOperands(clusterSizeZ); if (dynamicSharedMemorySize) result.addOperands(dynamicSharedMemorySize); @@ -678,9 +685,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, body->addArgument(argTy, result.location); kernelRegion->push_back(body); // Fill OperandSegmentSize Attribute. - SmallVector segmentSizes(8, 1); + SmallVector segmentSizes(11, 1); segmentSizes.front() = asyncDependencies.size(); segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0; + segmentSizes[7] = clusterSizeX ? 
1 : 0; + segmentSizes[8] = clusterSizeY ? 1 : 0; + segmentSizes[9] = clusterSizeZ ? 1 : 0; result.addAttribute(getOperandSegmentSizeAttr(), builder.getDenseI32ArrayAttr(segmentSizes)); } @@ -709,6 +719,22 @@ KernelDim3 LaunchOp::getBlockSize() { return KernelDim3{args[9], args[10], args[11]}; } +std::optional LaunchOp::getClusterIds() { + assert(!getBody().empty() && "LaunchOp body must not be empty."); + if (!hasClusterSize()) + return std::nullopt; + auto args = getBody().getArguments(); + return KernelDim3{args[12], args[13], args[14]}; +} + +std::optional LaunchOp::getClusterSize() { + assert(!getBody().empty() && "LaunchOp body must not be empty."); + if (!hasClusterSize()) + return std::nullopt; + auto args = getBody().getArguments(); + return KernelDim3{args[15], args[16], args[17]}; +} + KernelDim3 LaunchOp::getGridSizeOperandValues() { auto operands = getOperands().drop_front(getAsyncDependencies().size()); return KernelDim3{operands[0], operands[1], operands[2]}; @@ -719,6 +745,20 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() { return KernelDim3{operands[3], operands[4], operands[5]}; } +std::optional LaunchOp::getClusterSizeOperandValues() { + auto operands = getOperands().drop_front(getAsyncDependencies().size()); + if (!hasClusterSize()) + return std::nullopt; + return KernelDim3{operands[6], operands[7], operands[8]}; +} + +LogicalResult LaunchOp::verify() { + if (!(hasClusterSize()) && + (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ())) + return emitOpError() << "cluster size must be all present"; + return success(); +} + LogicalResult LaunchOp::verifyRegions() { // Kernel launch takes kNumConfigOperands leading operands for grid/block // sizes and transforms them into kNumConfigRegionAttributes region arguments @@ -778,6 +818,12 @@ void LaunchOp::print(OpAsmPrinter &p) { p << " [" << getAsyncDependencies() << ']'; } // Print the launch configuration. + if (hasClusterSize()) { + p << ' ' << getClustersKeyword(); + printSizeAssignment(p, getClusterSize().value(), + getClusterSizeOperandValues().value(), + getClusterIds().value()); + } p << ' ' << getBlocksKeyword(); printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(), getBlockIds()); @@ -831,6 +877,7 @@ parseSizeAssignment(OpAsmParser &parser, /// Parses a Launch operation. /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)? +/// `clusters` `(` ssa-id-list `)` `in` ssa-reassignment (Optional) /// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment /// `threads` `(` ssa-id-list `)` `in` ssa-reassignment /// memory-attribution @@ -840,7 +887,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // Sizes of the grid and block. SmallVector sizes(LaunchOp::kNumConfigOperands); - MutableArrayRef sizesRef(sizes); // Actual (data) operands passed to the kernel. SmallVector dataOperands; @@ -848,7 +894,6 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // Region arguments to be created. SmallVector regionArgs( LaunchOp::kNumConfigRegionAttributes); - MutableArrayRef regionArgsRef(regionArgs); // Parse optional async dependencies. 
SmallVector asyncDependencies; @@ -861,6 +906,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { if (parser.getNumResults() > 0) result.types.push_back(asyncTokenType); + bool hasCluster = false; + if (succeeded( + parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) { + hasCluster = true; + sizes.resize(9); + regionArgs.resize(18); + } + MutableArrayRef sizesRef(sizes); + MutableArrayRef regionArgsRef(regionArgs); + + // Last three segment assigns the cluster size. In the region argument + // list, this is last 6 arguments. + if (hasCluster) { + if (parseSizeAssignment(parser, sizesRef.drop_front(6), + regionArgsRef.slice(15, 3), + regionArgsRef.slice(12, 3))) + return failure(); + } // Parse the size assignment segments: the first segment assigns grid sizes // and defines values for block identifiers; the second segment assigns block // sizes and defines values for thread identifiers. In the region argument @@ -898,7 +961,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // LaunchOp::getNumWorkgroupAttributionsAttrName(). Type index = parser.getBuilder().getIndexType(); SmallVector dataTypes( - LaunchOp::kNumConfigRegionAttributes, index); + LaunchOp::kNumConfigRegionAttributes + 6, index); SmallVector regionArguments; for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) { @@ -916,8 +979,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // Store the number of operands we just parsed as the number of workgroup // memory attributions. - unsigned numWorkgroupAttrs = - regionArguments.size() - LaunchOp::kNumConfigRegionAttributes; + unsigned numWorkgroupAttrs = regionArguments.size() - + LaunchOp::kNumConfigRegionAttributes - + (hasCluster ? 6 : 0); result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(), builder.getI64IntegerAttr(numWorkgroupAttrs)); @@ -934,8 +998,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { parser.parseOptionalAttrDict(result.attributes)) return failure(); - SmallVector segmentSizes(8, 1); + SmallVector segmentSizes(11, 1); segmentSizes.front() = asyncDependencies.size(); + + if (!hasCluster) { + segmentSizes[7] = 0; + segmentSizes[8] = 0; + segmentSizes[9] = 0; + } segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0; result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(), parser.getBuilder().getDenseI32ArrayAttr(segmentSizes)); @@ -992,7 +1062,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) { (*this)->setAttr(attrName, IntegerAttr::get(attr.getType(), attr.getValue() + 1)); return getBody().insertArgument( - LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc); + LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc); } /// Adds a new block argument that corresponds to buffers located in diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index 7432a58..2436113 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc, /// entry block of `launchOpBody`, to the corresponding result value of the /// added operations. 
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, - Region &launchOpBody, IRMapping &map) { + Region &launchOpBody, IRMapping &map, + bool hasCluster = false) { OpBuilder builder(loc->getContext()); Block &firstBlock = launchOpBody.front(); builder.setInsertionPointToStart(&launchFuncOpBody.front()); - SmallVector indexOps; + SmallVector indexOps; + // The order is important here, as it must match the order of the arguments createForAllDimensions(builder, loc, indexOps); createForAllDimensions(builder, loc, indexOps); createForAllDimensions(builder, loc, indexOps); createForAllDimensions(builder, loc, indexOps); + if (hasCluster) { + createForAllDimensions(builder, loc, indexOps); + createForAllDimensions(builder, loc, indexOps); + } // Replace the leading 12 function args with the respective thread/block index // operations. Iterate backwards since args are erased and indices change. for (const auto &indexOp : enumerate(indexOps)) @@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, IRMapping map; // Map the arguments corresponding to the launch parameters like blockIdx, - // threadIdx, etc. + // threadIdx, etc. If cluster is present, then we also generate clusterIdx and + // clusterDim. Region &outlinedFuncBody = outlinedFunc.getBody(); - injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map); + injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map, + launchOp.hasClusterSize()); // Map memory attributions from the LaunOp op to the GPUFuncOp attributions. for (const auto &[launchArg, funcArg] : @@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, // The launch op has an optional dynamic shared memory size. If it doesn't // exist, we use zero. Value asyncToken = launchOp.getAsyncToken(); + std::optional clusterSize = + launchOp.getClusterSizeOperandValues(); auto launchFunc = builder.create( launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(), launchOp.getDynamicSharedMemorySize(), operands, asyncToken ? 
asyncToken.getType() : nullptr, - launchOp.getAsyncDependencies()); + launchOp.getAsyncDependencies(), clusterSize); launchOp.replaceAllUsesWith(launchFunc); launchOp.erase(); } diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir index a058365..79eef8a 100644 --- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir +++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir @@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref, %B : memref) { // CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index // CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index - // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) - // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) + // CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) + // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) affine.for %i = 0 to 42 { // CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]] // CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]] diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index 8a34d64..4d3a898 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -16,7 +16,7 @@ func.func @no_region_attrs(%sz : index) { ^bb1(%bx: index, %by: index, %bz: index, %tx: index, %ty: index, %tz: index): gpu.terminator - }) {operandSegmentSizes = array} : (index, index, index, index, index, index) -> () + }) {operandSegmentSizes = array} : (index, index, index, index, index, index) -> () return } diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 8020f6d..601add9 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -407,3 +407,77 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) { } // CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} + +// ----- +// CHECK: module attributes {gpu.container_module} + +// CHECK-LABEL: func @launch_cluster() +func.func @launch_cluster() { + // CHECK: %[[ARG0:.*]] = "op"() : () -> f32 + %0 = "op"() : () -> (f32) + // CHECK: %[[ARG1:.*]] = "op"() : () -> memref + %1 = "op"() : () -> (memref) + // CHECK: %[[CDIMX:.*]] = arith.constant 1 + %cDimX = arith.constant 1 : index + // CHECK: %[[CDIMY:.*]] = arith.constant 2 + %cDimY = arith.constant 2 : index + // CHECK: %[[CDIMZ:.*]] = arith.constant 1 + %cDimZ = arith.constant 1 : index + // CHECK: %[[GDIMX:.*]] = arith.constant 8 + %gDimX = arith.constant 8 : index + // CHECK: %[[GDIMY:.*]] = arith.constant 12 + %gDimY = arith.constant 12 : index + // CHECK: %[[GDIMZ:.*]] = arith.constant 16 + %gDimZ = arith.constant 16 : index + // CHECK: 
%[[BDIMX:.*]] = arith.constant 20 + %bDimX = arith.constant 20 : index + // CHECK: %[[BDIMY:.*]] = arith.constant 24 + %bDimY = arith.constant 24 : index + // CHECK: %[[BDIMZ:.*]] = arith.constant 28 + %bDimZ = arith.constant 28 : index + + // CHECK: gpu.launch_func @launch_cluster_kernel::@launch_cluster_kernel clusters in (%[[CDIMX]], %[[CDIMY]], %[[CDIMZ]]) blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref) + // CHECK-NOT: gpu.launch blocks + gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY, + %cluster_z = %cDimZ) + blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, + %grid_z = %gDimZ) + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, + %block_z = %bDimZ) { + "use"(%0): (f32) -> () + "some_op"(%cx, %bx, %block_x) : (index, index, index) -> () + %42 = memref.load %1[%tx] : memref + gpu.terminator + } + return +} + +// CHECK-LABEL: gpu.module @launch_cluster_kernel +// CHECK-NEXT: gpu.func @launch_cluster_kernel +// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref) +// CHECK-SAME: gpu.known_block_size = array +// CHECK-SAME: gpu.known_grid_size = array +// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x +// CHECK-NEXT: = gpu.block_id y +// CHECK-NEXT: = gpu.block_id z +// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x +// CHECK-NEXT: = gpu.thread_id y +// CHECK-NEXT: = gpu.thread_id z +// CHECK-NEXT: = gpu.grid_dim x +// CHECK-NEXT: = gpu.grid_dim y +// CHECK-NEXT: = gpu.grid_dim z +// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x +// CHECK-NEXT: = gpu.block_dim y +// CHECK-NEXT: = gpu.block_dim z +// CHECK-NEXT: %[[CID:.*]] = gpu.cluster_id x +// CHECK-NEXT: = gpu.cluster_id y +// CHECK-NEXT: = gpu.cluster_id z +// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x +// CHECK-NEXT: = gpu.cluster_dim y +// CHECK-NEXT: = gpu.cluster_dim z +// CHECK-NEXT: cf.br ^[[BLOCK:.*]] +// CHECK-NEXT: ^[[BLOCK]]: +// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> () +// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> () +// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref + -- cgit v1.1 From 1687555572ee4fb435da400dde02e7a1e60b742c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Sat, 6 Jan 2024 11:28:10 +0100 Subject: [GlobalIsel] Combine select of binops (#76763) --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../llvm/CodeGen/GlobalISel/GenericMachineInstrs.h | 103 ++++++++++++++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 93 +++++++++---- .../CodeGen/AArch64/GlobalISel/combine-select.mir | 151 +++++++++++++++++++++ 4 files changed, 322 insertions(+), 28 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index dcc1a45..f3b6862 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -910,6 +910,9 @@ private: bool tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo); + /// Try to fold select(cc, binop(), binop()) -> binop(select(), X) + bool tryFoldSelectOfBinOps(GSelect *Select, BuildFnTy &MatchInfo); + bool isOneOrOneSplat(Register Src, bool AllowUndefs); bool isZeroOrZeroSplat(Register Src, bool AllowUndefs); bool isConstantSplatVector(Register Src, int64_t SplatValue, diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 
6ab1d455..21d98d3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -558,6 +558,109 @@ public: } }; +// Represents a binary operation, i.e, x = y op z. +class GBinOp : public GenericMachineInstr { +public: + Register getLHSReg() const { return getReg(1); } + Register getRHSReg() const { return getReg(2); } + + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + // Integer. + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_MUL: + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: + case TargetOpcode::G_SMIN: + case TargetOpcode::G_SMAX: + case TargetOpcode::G_UMIN: + case TargetOpcode::G_UMAX: + // Floating point. + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMINNUM_IEEE: + case TargetOpcode::G_FMAXNUM_IEEE: + case TargetOpcode::G_FMINIMUM: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FPOW: + // Logical. + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + return true; + default: + return false; + } + }; +}; + +// Represents an integer binary operation. +class GIntBinOp : public GBinOp { +public: + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_MUL: + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: + case TargetOpcode::G_SMIN: + case TargetOpcode::G_SMAX: + case TargetOpcode::G_UMIN: + case TargetOpcode::G_UMAX: + return true; + default: + return false; + } + }; +}; + +// Represents a floating point binary operation. +class GFBinOp : public GBinOp { +public: + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMINNUM_IEEE: + case TargetOpcode::G_FMAXNUM_IEEE: + case TargetOpcode::G_FMINIMUM: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FPOW: + return true; + default: + return false; + } + }; +}; + +// Represents a logical binary operation. 
+class GLogicalBinOp : public GBinOp { +public: + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + return true; + default: + return false; + } + }; +}; } // namespace llvm diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 8b15bdb..5d8def4 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6390,8 +6390,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (TrueValue.isZero() && FalseValue.isOne()) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Inner = MRI.createGenericVirtualRegister(CondTy); - B.buildNot(Inner, Cond); + auto Inner = B.buildNot(CondTy, Cond); B.buildZExtOrTrunc(Dest, Inner); }; return true; @@ -6401,8 +6400,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (TrueValue.isZero() && FalseValue.isAllOnes()) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Inner = MRI.createGenericVirtualRegister(CondTy); - B.buildNot(Inner, Cond); + auto Inner = B.buildNot(CondTy, Cond); B.buildSExtOrTrunc(Dest, Inner); }; return true; @@ -6412,8 +6410,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (TrueValue - 1 == FalseValue) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Inner = MRI.createGenericVirtualRegister(TrueTy); - B.buildZExtOrTrunc(Inner, Cond); + auto Inner = B.buildZExtOrTrunc(TrueTy, Cond); B.buildAdd(Dest, Inner, False); }; return true; @@ -6423,8 +6420,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (TrueValue + 1 == FalseValue) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Inner = MRI.createGenericVirtualRegister(TrueTy); - B.buildSExtOrTrunc(Inner, Cond); + auto Inner = B.buildSExtOrTrunc(TrueTy, Cond); B.buildAdd(Dest, Inner, False); }; return true; @@ -6434,8 +6430,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (TrueValue.isPowerOf2() && FalseValue.isZero()) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Inner = MRI.createGenericVirtualRegister(TrueTy); - B.buildZExtOrTrunc(Inner, Cond); + auto Inner = B.buildZExtOrTrunc(TrueTy, Cond); // The shift amount must be scalar. LLT ShiftTy = TrueTy.isVector() ? 
TrueTy.getElementType() : TrueTy; auto ShAmtC = B.buildConstant(ShiftTy, TrueValue.exactLogBase2()); @@ -6447,8 +6442,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (TrueValue.isAllOnes()) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Inner = MRI.createGenericVirtualRegister(TrueTy); - B.buildSExtOrTrunc(Inner, Cond); + auto Inner = B.buildSExtOrTrunc(TrueTy, Cond); B.buildOr(Dest, Inner, False, Flags); }; return true; @@ -6458,10 +6452,8 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, if (FalseValue.isAllOnes()) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Not = MRI.createGenericVirtualRegister(CondTy); - B.buildNot(Not, Cond); - Register Inner = MRI.createGenericVirtualRegister(TrueTy); - B.buildSExtOrTrunc(Inner, Not); + auto Not = B.buildNot(CondTy, Cond); + auto Inner = B.buildSExtOrTrunc(TrueTy, Not); B.buildOr(Dest, Inner, True, Flags); }; return true; @@ -6496,8 +6488,7 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, if ((Cond == True) || isOneOrOneSplat(True, /* AllowUndefs */ true)) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Ext = MRI.createGenericVirtualRegister(TrueTy); - B.buildZExtOrTrunc(Ext, Cond); + auto Ext = B.buildZExtOrTrunc(TrueTy, Cond); B.buildOr(DstReg, Ext, False, Flags); }; return true; @@ -6508,8 +6499,7 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, if ((Cond == False) || isZeroOrZeroSplat(False, /* AllowUndefs */ true)) { MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); - Register Ext = MRI.createGenericVirtualRegister(TrueTy); - B.buildZExtOrTrunc(Ext, Cond); + auto Ext = B.buildZExtOrTrunc(TrueTy, Cond); B.buildAnd(DstReg, Ext, True); }; return true; @@ -6520,11 +6510,9 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); // First the not. - Register Inner = MRI.createGenericVirtualRegister(CondTy); - B.buildNot(Inner, Cond); + auto Inner = B.buildNot(CondTy, Cond); // Then an ext to match the destination register. - Register Ext = MRI.createGenericVirtualRegister(TrueTy); - B.buildZExtOrTrunc(Ext, Inner); + auto Ext = B.buildZExtOrTrunc(TrueTy, Inner); B.buildOr(DstReg, Ext, True, Flags); }; return true; @@ -6535,11 +6523,9 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*Select); // First the not. - Register Inner = MRI.createGenericVirtualRegister(CondTy); - B.buildNot(Inner, Cond); + auto Inner = B.buildNot(CondTy, Cond); // Then an ext to match the destination register. - Register Ext = MRI.createGenericVirtualRegister(TrueTy); - B.buildZExtOrTrunc(Ext, Inner); + auto Ext = B.buildZExtOrTrunc(TrueTy, Inner); B.buildAnd(DstReg, Ext, False); }; return true; @@ -6548,6 +6534,54 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, return false; } +bool CombinerHelper::tryFoldSelectOfBinOps(GSelect *Select, + BuildFnTy &MatchInfo) { + Register DstReg = Select->getReg(0); + Register Cond = Select->getCondReg(); + Register False = Select->getFalseReg(); + Register True = Select->getTrueReg(); + LLT DstTy = MRI.getType(DstReg); + + GBinOp *LHS = getOpcodeDef(True, MRI); + GBinOp *RHS = getOpcodeDef(False, MRI); + + // We need two binops of the same kind on the true/false registers. 
+ if (!LHS || !RHS || LHS->getOpcode() != RHS->getOpcode()) + return false; + + // Note that there are no constraints on CondTy. + unsigned Flags = (LHS->getFlags() & RHS->getFlags()) | Select->getFlags(); + unsigned Opcode = LHS->getOpcode(); + + // Fold select(cond, binop(x, y), binop(z, y)) + // --> binop(select(cond, x, z), y) + if (LHS->getRHSReg() == RHS->getRHSReg()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + auto Sel = B.buildSelect(DstTy, Cond, LHS->getLHSReg(), RHS->getLHSReg(), + Select->getFlags()); + B.buildInstr(Opcode, {DstReg}, {Sel, LHS->getRHSReg()}, Flags); + }; + return true; + } + + // Fold select(cond, binop(x, y), binop(x, z)) + // --> binop(x, select(cond, y, z)) + if (LHS->getLHSReg() == RHS->getLHSReg()) { + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*Select); + auto Sel = B.buildSelect(DstTy, Cond, LHS->getRHSReg(), RHS->getRHSReg(), + Select->getFlags()); + B.buildInstr(Opcode, {DstReg}, {LHS->getLHSReg(), Sel}, Flags); + }; + return true; + } + + // FIXME: use isCommutable(). + + return false; +} + bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) { GSelect *Select = cast(&MI); @@ -6557,5 +6591,8 @@ bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) { if (tryFoldBoolSelectToLogic(Select, MatchInfo)) return true; + if (tryFoldSelectOfBinOps(Select, MatchInfo)) + return true; + return false; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir index be2de62..c5a3490 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir @@ -544,3 +544,154 @@ body: | %ext:_(s32) = G_ANYEXT %sel $w0 = COPY %ext(s32) ... +--- +# select cond, and(x, y), and(z, y) --> and (select cond, x, z), y +name: select_cond_and_x_y_and_z_y_and_select_x_z_y +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_and_x_y_and_z_y_and_select_x_z_y + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %a:_(s8) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %b:_(s8) = G_TRUNC [[COPY2]](s64) + ; CHECK-NEXT: %d:_(s8) = G_TRUNC [[COPY3]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s8) = G_SELECT %c(s1), %a, %d + ; CHECK-NEXT: %sel:_(s8) = G_AND [[SELECT]], %b + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %c:_(s1) = G_TRUNC %0 + %a:_(s8) = G_TRUNC %1 + %b:_(s8) = G_TRUNC %2 + %d:_(s8) = G_TRUNC %3 + %e:_(s8) = G_TRUNC %4 + %and1:_(s8) = G_AND %a, %b + %and2:_(s8) = G_AND %d, %b + %sel:_(s8) = G_SELECT %c, %and1, %and2 + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
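#
# The shared-operand fold exercised above can be summarized as (the register
# names below are illustrative, not taken from the test):
#   %sel = G_SELECT %c, (G_AND %a, %b), (G_AND %d, %b)
# becomes
#   %t   = G_SELECT %c, %a, %d
#   %sel = G_AND %t, %b
# i.e. the select is pushed onto the operands that differ and one of the two
# binops is removed.
#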
+--- +# select cond, xor(x, y), xor(x, z) --> xor x, select, x, z) +name: select_cond_xor_x_y_xor_x_z_xor_x__select_x_y +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_xor_x_y_xor_x_z_xor_x__select_x_y + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %a:_(s8) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %d:_(s8) = G_TRUNC [[COPY2]](s64) + ; CHECK-NEXT: %e:_(s8) = G_TRUNC [[COPY3]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s8) = G_SELECT %c(s1), %e, %d + ; CHECK-NEXT: %sel:_(s8) = G_XOR %a, [[SELECT]] + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %c:_(s1) = G_TRUNC %0 + %a:_(s8) = G_TRUNC %1 + %b:_(s8) = G_TRUNC %2 + %d:_(s8) = G_TRUNC %3 + %e:_(s8) = G_TRUNC %4 + %xor1:_(s8) = G_XOR %a, %e + %xor2:_(s8) = G_XOR %a, %d + %sel:_(s8) = G_SELECT %c, %xor1, %xor2 + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# negative test select cond, and(x, y), or(z, a) --> failed +name: select_cond_and_x_y_or_z_a_failed +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: select_cond_and_x_y_or_z_a_failed + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %a:_(s8) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %b:_(s8) = G_TRUNC [[COPY2]](s64) + ; CHECK-NEXT: %d:_(s8) = G_TRUNC [[COPY3]](s64) + ; CHECK-NEXT: %e:_(s8) = G_TRUNC [[COPY4]](s64) + ; CHECK-NEXT: %and1:_(s8) = G_AND %a, %b + ; CHECK-NEXT: %or2:_(s8) = G_OR %e, %d + ; CHECK-NEXT: %sel:_(s8) = G_SELECT %c(s1), %and1, %or2 + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %c:_(s1) = G_TRUNC %0 + %a:_(s8) = G_TRUNC %1 + %b:_(s8) = G_TRUNC %2 + %d:_(s8) = G_TRUNC %3 + %e:_(s8) = G_TRUNC %4 + %and1:_(s8) = G_AND %a, %b + %or2:_(s8) = G_OR %e, %d + %sel:_(s8) = G_SELECT %c, %and1, %or2 + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
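#
# The case above is deliberately not combined: the true and false values come
# from different binop kinds (G_AND vs. G_OR), and tryFoldSelectOfBinOps only
# fires when both sides share the same opcode. The following test covers flag
# propagation: the rebuilt select keeps the original select's flags, while the
# new binop receives the intersection of the two original binops' flags
# combined with the select's flags.
#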
+--- +# flags test select cond, xor(x, y), xor(x, z) --> xor x, select, cond, x, z) +name: flags_select_cond_xor_x_y_xor_x_z_xor_x__select_cond_x_y +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: flags_select_cond_xor_x_y_xor_x_z_xor_x__select_cond_x_y + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %a:_(s8) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %d:_(s8) = G_TRUNC [[COPY2]](s64) + ; CHECK-NEXT: %e:_(s8) = G_TRUNC [[COPY3]](s64) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s8) = ninf exact G_SELECT %c(s1), %e, %d + ; CHECK-NEXT: %sel:_(s8) = ninf arcp exact G_XOR %a, [[SELECT]] + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %c:_(s1) = G_TRUNC %0 + %a:_(s8) = G_TRUNC %1 + %b:_(s8) = G_TRUNC %2 + %d:_(s8) = G_TRUNC %3 + %e:_(s8) = G_TRUNC %4 + %xor1:_(s8) = nsz arcp nsw G_XOR %a, %e + %xor2:_(s8) = nnan arcp nuw G_XOR %a, %d + %sel:_(s8) = ninf exact G_SELECT %c, %xor1, %xor2 + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... -- cgit v1.1 From 0abf3a93a3088140c0585672c8b852e5db93a302 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sat, 6 Jan 2024 18:30:37 +0800 Subject: [X86][NFC] Use single table for EVEX compression This patch is to address my review comments in #77065 to simplify the implemention of EVEX2Legacy compression. --- llvm/lib/Target/X86/X86CompressEVEX.cpp | 26 ++++------------ .../TableGen/X86CompressEVEXTablesEmitter.cpp | 35 +++++++--------------- 2 files changed, 15 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index accb98c..fc980c6 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -224,16 +224,7 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { return true; } -// For EVEX instructions that can be encoded using VEX encoding -// replace them by the VEX encoding in order to reduce size. -static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) { - // VEX format. - // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1 - // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM] - // - // EVEX format. - // # of bytes: 4 1 1 1 4 / 1 1 - // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate] +static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { const MCInstrDesc &Desc = MI.getDesc(); // Check for EVEX instructions only. @@ -251,10 +242,7 @@ static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) { if (Desc.TSFlags & X86II::EVEX_L2) return false; - // Use the VEX.L bit to select the 128 or 256-bit table. - ArrayRef Table = - (Desc.TSFlags & X86II::VEX_L) ? ArrayRef(X86EvexToVex256CompressTable) - : ArrayRef(X86EvexToVex128CompressTable); + ArrayRef Table = ArrayRef(X86CompressEVEXTable); unsigned Opc = MI.getOpcode(); const auto *I = llvm::lower_bound(Table, Opc); @@ -278,10 +266,8 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) { // Make sure the tables are sorted. 
static std::atomic TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { - assert(llvm::is_sorted(X86EvexToVex128CompressTable) && - "X86EvexToVex128CompressTable is not sorted!"); - assert(llvm::is_sorted(X86EvexToVex256CompressTable) && - "X86EvexToVex256CompressTable is not sorted!"); + assert(llvm::is_sorted(X86CompressEVEXTable) && + "X86CompressEVEXTable is not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } #endif @@ -291,12 +277,10 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - /// Go over all basic blocks in function and replace - /// EVEX encoded instrs by VEX encoding when possible. for (MachineBasicBlock &MBB : MF) { // Traverse the basic block. for (MachineInstr &MI : MBB) - Changed |= CompressEvexToVexImpl(MI, ST); + Changed |= CompressEVEXImpl(MI, ST); } return Changed; diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index a45e87a..0fcd0b0 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -47,9 +47,7 @@ class X86CompressEVEXTablesEmitter { typedef std::pair Entry; - // Represent both compress tables - std::vector EVEX2VEX128; - std::vector EVEX2VEX256; + std::vector Table; public: X86CompressEVEXTablesEmitter(RecordKeeper &R) : Records(R), Target(R) {} @@ -64,20 +62,13 @@ private: void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, raw_ostream &OS) { - StringRef Size = (Table == EVEX2VEX128) ? "128" : "256"; - OS << "// X86 EVEX encoded instructions that have a VEX " << Size - << " encoding\n" - << "// (table format: ).\n" - << "static const X86CompressEVEXTableEntry X86EvexToVex" << Size - << "CompressTable[] = {\n" - << " // EVEX scalar with corresponding VEX.\n"; + OS << "static const X86CompressEVEXTableEntry X86CompressEVEXTable[] = { \n"; // Print all entries added to the table - for (const auto &Pair : Table) { + for (const auto &Pair : Table) OS << " { X86::" << Pair.first->TheDef->getName() << ", X86::" << Pair.second->TheDef->getName() << " },\n"; - } OS << "};\n\n"; } @@ -175,33 +166,27 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { const Record *Rec = Inst->TheDef; uint64_t Opcode = getValueFromBitsInit(Inst->TheDef->getValueAsBitsInit("Opcode")); - const CodeGenInstruction *VEXInst = nullptr; + const CodeGenInstruction *NewInst = nullptr; if (ManualMap.find(Rec->getName()) != ManualMap.end()) { Record *NewRec = Records.getDef(ManualMap.at(Rec->getName())); assert(NewRec && "Instruction not found!"); - VEXInst = &Target.getInstruction(NewRec); + NewInst = &Target.getInstruction(NewRec); } else { - // For each EVEX instruction look for a VEX match in the appropriate + // For each pre-compression instruction look for a match in the appropriate // vector (instructions with the same opcode) using function object // IsMatch. 
auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(Inst)); if (Match != CompressedInsts[Opcode].end()) - VEXInst = *Match; + NewInst = *Match; } - if (!VEXInst) + if (!NewInst) continue; - // In case a match is found add new entry to the appropriate table - if (Rec->getValueAsBit("hasVEX_L")) - EVEX2VEX256.push_back(std::make_pair(Inst, VEXInst)); // {0,1} - else - EVEX2VEX128.push_back(std::make_pair(Inst, VEXInst)); // {0,0} + Table.push_back(std::make_pair(Inst, NewInst)); } - // Print both tables - printTable(EVEX2VEX128, OS); - printTable(EVEX2VEX256, OS); + printTable(Table, OS); } } // namespace -- cgit v1.1 From bd0dc357af453f03770c5d43c66ee5a3584abdca Mon Sep 17 00:00:00 2001 From: Abhinav271828 <71174780+Abhinav271828@users.noreply.github.com> Date: Sat, 6 Jan 2024 17:08:25 +0530 Subject: [MLIR][Presburger] Shift GeneratingFunction.h to includes (#77114) We shift the GeneratingFunction.h header file to the include/ directory and wrap it in a `detail` namespace. --- .../mlir/Analysis/Presburger/GeneratingFunction.h | 137 +++++++++++++++++++++ mlir/lib/Analysis/Presburger/GeneratingFunction.h | 134 -------------------- mlir/unittests/Analysis/Presburger/CMakeLists.txt | 1 + .../Analysis/Presburger/GeneratingFunctionTest.cpp | 39 ++++++ mlir/unittests/Analysis/Presburger/Utils.h | 36 +++++- 5 files changed, 212 insertions(+), 135 deletions(-) create mode 100644 mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h delete mode 100644 mlir/lib/Analysis/Presburger/GeneratingFunction.h create mode 100644 mlir/unittests/Analysis/Presburger/GeneratingFunctionTest.cpp diff --git a/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h b/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h new file mode 100644 index 0000000..4dd692c --- /dev/null +++ b/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h @@ -0,0 +1,137 @@ +//===- GeneratingFunction.h - Generating Functions over Q^d -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of the GeneratingFunction class for Barvinok's algorithm, +// which represents a function over Q^n, parameterized by d parameters. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H +#define MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H + +#include "mlir/Analysis/Presburger/Fraction.h" +#include "mlir/Analysis/Presburger/Matrix.h" + +namespace mlir { +namespace presburger { +namespace detail { + +// A parametric point is a vector, each of whose elements +// is an affine function of n parameters. Each row +// in the matrix represents the affine function and +// has n+1 elements. +using ParamPoint = FracMatrix; + +// A point is simply a vector. +using Point = SmallVector; + +// A class to describe the type of generating function +// used to enumerate the integer points in a polytope. +// Consists of a set of terms, where the ith term has +// * a sign, ±1, stored in `signs[i]` +// * a numerator, of the form x^{n}, +// where n, stored in `numerators[i]`, +// is a parametric point. +// * a denominator, of the form (1 - x^{d1})...(1 - x^{dn}), +// where each dj, stored in `denominators[i][j]`, +// is a vector. 
+// +// Represents functions f_p : Q^n -> Q of the form +// +// f_p(x) = \sum_i s_i * (x^n_i(p)) / (\prod_j (1 - x^d_{ij}) +// +// where s_i is ±1, +// n_i \in Q^d -> Q^n is an n-vector of affine functions on d parameters, and +// g_{ij} \in Q^n are vectors. +class GeneratingFunction { +public: + GeneratingFunction(unsigned numParam, SmallVector signs, + std::vector nums, + std::vector> dens) + : numParam(numParam), signs(signs), numerators(nums), denominators(dens) { +#ifndef NDEBUG + for (const ParamPoint &term : numerators) + assert(term.getNumColumns() == numParam + 1 && + "dimensionality of numerator exponents does not match number of " + "parameters!"); +#endif // NDEBUG + } + + unsigned getNumParams() { return numParam; } + + SmallVector getSigns() { return signs; } + + std::vector getNumerators() { return numerators; } + + std::vector> getDenominators() { return denominators; } + + GeneratingFunction operator+(GeneratingFunction &gf) const { + assert(numParam == gf.getNumParams() && + "two generating functions with different numbers of parameters " + "cannot be added!"); + SmallVector sumSigns = signs; + sumSigns.append(gf.signs); + + std::vector sumNumerators = numerators; + sumNumerators.insert(sumNumerators.end(), gf.numerators.begin(), + gf.numerators.end()); + + std::vector> sumDenominators = denominators; + sumDenominators.insert(sumDenominators.end(), gf.denominators.begin(), + gf.denominators.end()); + return GeneratingFunction(numParam, sumSigns, sumNumerators, + sumDenominators); + } + + llvm::raw_ostream &print(llvm::raw_ostream &os) const { + for (unsigned i = 0, e = signs.size(); i < e; i++) { + if (i == 0) { + if (signs[i] == -1) + os << "- "; + } else { + if (signs[i] == 1) + os << " + "; + else + os << " - "; + } + + os << "x^["; + unsigned r = numerators[i].getNumRows(); + for (unsigned j = 0; j < r - 1; j++) { + os << "["; + for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++) + os << numerators[i].at(j, k) << ","; + os << numerators[i].getRow(j).back() << "],"; + } + os << "["; + for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++) + os << numerators[i].at(r - 1, k) << ","; + os << numerators[i].getRow(r - 1).back() << "]]/"; + + for (const Point &den : denominators[i]) { + os << "(x^["; + for (unsigned j = 0, e = den.size(); j < e - 1; j++) + os << den[j] << ","; + os << den.back() << "])"; + } + } + return os; + } + +private: + unsigned numParam; + SmallVector signs; + std::vector numerators; + std::vector> denominators; +}; + +} // namespace detail +} // namespace presburger +} // namespace mlir + +#endif // MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H diff --git a/mlir/lib/Analysis/Presburger/GeneratingFunction.h b/mlir/lib/Analysis/Presburger/GeneratingFunction.h deleted file mode 100644 index f7deba9..0000000 --- a/mlir/lib/Analysis/Presburger/GeneratingFunction.h +++ /dev/null @@ -1,134 +0,0 @@ -//===- GeneratingFunction.h - Generating Functions over Q^d -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definition of the GeneratingFunction class for Barvinok's algorithm, -// which represents a function over Q^n, parameterized by d parameters. 
-// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H -#define MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H - -#include "mlir/Analysis/Presburger/Fraction.h" -#include "mlir/Analysis/Presburger/Matrix.h" - -namespace mlir { -namespace presburger { - -// A parametric point is a vector, each of whose elements -// is an affine function of n parameters. Each row -// in the matrix represents the affine function and -// has n+1 elements. -using ParamPoint = FracMatrix; - -// A point is simply a vector. -using Point = SmallVector; - -// A class to describe the type of generating function -// used to enumerate the integer points in a polytope. -// Consists of a set of terms, where the ith term has -// * a sign, ±1, stored in `signs[i]` -// * a numerator, of the form x^{n}, -// where n, stored in `numerators[i]`, -// is a parametric point. -// * a denominator, of the form (1 - x^{d1})...(1 - x^{dn}), -// where each dj, stored in `denominators[i][j]`, -// is a vector. -// -// Represents functions f_p : Q^n -> Q of the form -// -// f_p(x) = \sum_i s_i * (x^n_i(p)) / (\prod_j (1 - x^d_{ij}) -// -// where s_i is ±1, -// n_i \in Q^d -> Q^n is an n-vector of affine functions on d parameters, and -// g_{ij} \in Q^n are vectors. -class GeneratingFunction { -public: - GeneratingFunction(unsigned numParam, SmallVector signs, - std::vector nums, - std::vector> dens) - : numParam(numParam), signs(signs), numerators(nums), denominators(dens) { -#ifndef NDEBUG - for (const ParamPoint &term : numerators) - assert(term.getNumColumns() == numParam + 1 && - "dimensionality of numerator exponents does not match number of " - "parameters!"); -#endif // NDEBUG - } - - unsigned getNumParams() { return numParam; } - - SmallVector getSigns() { return signs; } - - std::vector getNumerators() { return numerators; } - - std::vector> getDenominators() { return denominators; } - - GeneratingFunction operator+(GeneratingFunction &gf) const { - assert(numParam == gf.getNumParams() && - "two generating functions with different numbers of parameters " - "cannot be added!"); - SmallVector sumSigns = signs; - sumSigns.append(gf.signs); - - std::vector sumNumerators = numerators; - sumNumerators.insert(sumNumerators.end(), gf.numerators.begin(), - gf.numerators.end()); - - std::vector> sumDenominators = denominators; - sumDenominators.insert(sumDenominators.end(), gf.denominators.begin(), - gf.denominators.end()); - return GeneratingFunction(0, sumSigns, sumNumerators, sumDenominators); - } - - llvm::raw_ostream &print(llvm::raw_ostream &os) const { - for (unsigned i = 0, e = signs.size(); i < e; i++) { - if (i == 0) { - if (signs[i] == -1) - os << "- "; - } else { - if (signs[i] == 1) - os << " + "; - else - os << " - "; - } - - os << "x^["; - unsigned r = numerators[i].getNumRows(); - for (unsigned j = 0; j < r - 1; j++) { - os << "["; - for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++) - os << numerators[i].at(j, k) << ","; - os << numerators[i].getRow(j).back() << "],"; - } - os << "["; - for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++) - os << numerators[i].at(r - 1, k) << ","; - os << numerators[i].getRow(r - 1).back() << "]]/"; - - for (const Point &den : denominators[i]) { - os << "(x^["; - for (unsigned j = 0, e = den.size(); j < e - 1; j++) - os << den[j] << ","; - os << den.back() << "])"; - } - } - return os; - } - -private: - unsigned numParam; - SmallVector signs; - std::vector 
numerators; - std::vector> denominators; -}; - -} // namespace presburger -} // namespace mlir - -#endif // MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H diff --git a/mlir/unittests/Analysis/Presburger/CMakeLists.txt b/mlir/unittests/Analysis/Presburger/CMakeLists.txt index e371333..54a8417 100644 --- a/mlir/unittests/Analysis/Presburger/CMakeLists.txt +++ b/mlir/unittests/Analysis/Presburger/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIRPresburgerTests FractionTest.cpp + GeneratingFunctionTest.cpp IntegerPolyhedronTest.cpp IntegerRelationTest.cpp LinearTransformTest.cpp diff --git a/mlir/unittests/Analysis/Presburger/GeneratingFunctionTest.cpp b/mlir/unittests/Analysis/Presburger/GeneratingFunctionTest.cpp new file mode 100644 index 0000000..5df1a5a --- /dev/null +++ b/mlir/unittests/Analysis/Presburger/GeneratingFunctionTest.cpp @@ -0,0 +1,39 @@ +//===- MatrixTest.cpp - Tests for QuasiPolynomial -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Presburger/GeneratingFunction.h" +#include "./Utils.h" +#include +#include + +using namespace mlir; +using namespace presburger; +using namespace mlir::presburger::detail; + +TEST(GeneratingFunctionTest, sum) { + GeneratingFunction gf1(2, {1, -1}, + {makeFracMatrix(2, 3, {{1, 2, 5}, {7, 2, 6}}), + makeFracMatrix(2, 3, {{5, 2, 5}, {3, 7, 2}})}, + {{{3, 6}, {7, 2}}, {{2, 8}, {6, 3}}}); + GeneratingFunction gf2(2, {1, 1}, + {makeFracMatrix(2, 3, {{6, 2, 1}, {4, 2, 6}}), + makeFracMatrix(2, 3, {{3, 2, 6}, {9, 2, 5}})}, + {{{3, 7}, {5, 1}}, {{5, 2}, {6, 2}}}); + + GeneratingFunction sum = gf1 + gf2; + EXPECT_EQ_REPR_GENERATINGFUNCTION( + sum, GeneratingFunction(2, {1, -1, 1, 1}, + {makeFracMatrix(2, 3, {{1, 2, 5}, {7, 2, 6}}), + makeFracMatrix(2, 3, {{5, 2, 5}, {3, 7, 2}}), + makeFracMatrix(2, 3, {{6, 2, 1}, {4, 2, 6}}), + makeFracMatrix(2, 3, {{3, 2, 6}, {9, 2, 5}})}, + {{{3, 6}, {7, 2}}, + {{2, 8}, {6, 3}}, + {{3, 7}, {5, 1}}, + {{5, 2}, {6, 2}}})); +} diff --git a/mlir/unittests/Analysis/Presburger/Utils.h b/mlir/unittests/Analysis/Presburger/Utils.h index 2a9966c..6b00898a 100644 --- a/mlir/unittests/Analysis/Presburger/Utils.h +++ b/mlir/unittests/Analysis/Presburger/Utils.h @@ -13,6 +13,7 @@ #ifndef MLIR_UNITTESTS_ANALYSIS_PRESBURGER_UTILS_H #define MLIR_UNITTESTS_ANALYSIS_PRESBURGER_UTILS_H +#include "mlir/Analysis/Presburger/GeneratingFunction.h" #include "mlir/Analysis/Presburger/IntegerRelation.h" #include "mlir/Analysis/Presburger/Matrix.h" #include "mlir/Analysis/Presburger/PWMAFunction.h" @@ -72,9 +73,42 @@ inline void EXPECT_EQ_FRAC_MATRIX(FracMatrix a, FracMatrix b) { EXPECT_EQ(a(row, col), b(row, col)); } +// Check the coefficients (in order) of two generating functions. +// Note that this is not a true equality check. 
+inline void EXPECT_EQ_REPR_GENERATINGFUNCTION(detail::GeneratingFunction a, + detail::GeneratingFunction b) { + EXPECT_EQ(a.getNumParams(), b.getNumParams()); + + SmallVector aSigns = a.getSigns(); + SmallVector bSigns = b.getSigns(); + EXPECT_EQ(aSigns.size(), bSigns.size()); + for (unsigned i = 0, e = aSigns.size(); i < e; i++) + EXPECT_EQ(aSigns[i], bSigns[i]); + + std::vector aNums = a.getNumerators(); + std::vector bNums = b.getNumerators(); + EXPECT_EQ(aNums.size(), bNums.size()); + for (unsigned i = 0, e = aNums.size(); i < e; i++) + EXPECT_EQ_FRAC_MATRIX(aNums[i], bNums[i]); + + std::vector> aDens = a.getDenominators(); + std::vector> bDens = b.getDenominators(); + EXPECT_EQ(aDens.size(), bDens.size()); + for (unsigned i = 0, e = aDens.size(); i < e; i++) { + EXPECT_EQ(aDens[i].size(), bDens[i].size()); + for (unsigned j = 0, f = aDens[i].size(); j < f; j++) { + EXPECT_EQ(aDens[i][j].size(), bDens[i][j].size()); + for (unsigned k = 0, g = aDens[i][j].size(); k < g; k++) { + EXPECT_EQ(aDens[i][j][k], bDens[i][j][k]); + } + } + } +} + // Check the coefficients (in order) of two quasipolynomials. // Note that this is not a true equality check. -inline void EXPECT_EQ_REPR_QUASIPOLYNOMIAL(QuasiPolynomial a, QuasiPolynomial b) { +inline void EXPECT_EQ_REPR_QUASIPOLYNOMIAL(QuasiPolynomial a, + QuasiPolynomial b) { EXPECT_EQ(a.getNumInputs(), b.getNumInputs()); SmallVector aCoeffs = a.getCoefficients(), -- cgit v1.1 From d08482924efe8b2c44913583af7b8f60a29975d1 Mon Sep 17 00:00:00 2001 From: Qizhi Hu <836744285@qq.com> Date: Sat, 6 Jan 2024 19:50:00 +0800 Subject: [clang-tidy] fix false positive in cppcoreguidelines-missing-std-forward (#77056) Parameter variable which is forwarded in lambda capture list or in body by reference is reasonable and current version of this check produces false positive on these cases. 
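As a minimal illustration (hypothetical code, not taken from the patch; the test updates later in this series carry the authoritative cases), this is the kind of lambda-based forwarding the check used to flag even though the parameter is genuinely forwarded:

```cpp
#include <utility>

// Hypothetical example of a previously reported false positive: `t` is only
// forwarded inside a lambda that captures it by reference, which is a
// legitimate use and should not be diagnosed.
template <typename Fn, typename T>
void defer(Fn fn, T &&t) {
  auto task = [&t, fn] { fn(std::forward<T>(t)); };
  task(); // e.g. defer([](int &&) {}, 42);
}
```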
This patch try to fix the [issue](https://github.com/llvm/llvm-project/issues/68105) Co-authored-by: huqizhi <836744285@qq.com> --- .../cppcoreguidelines/MissingStdForwardCheck.cpp | 60 +++++++++++++++++++++- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++ .../cppcoreguidelines/missing-std-forward.cpp | 31 +++++++++-- 3 files changed, 90 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp index 0b85ea1..370de12 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp @@ -53,16 +53,72 @@ AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { FuncTemplate->getTemplateParameters()->getDepth(); } +AST_MATCHER_P(NamedDecl, hasSameNameAsBoundNode, std::string, BindingID) { + IdentifierInfo *II = Node.getIdentifier(); + if (nullptr == II) + return false; + StringRef Name = II->getName(); + + return Builder->removeBindings( + [this, Name](const ast_matchers::internal::BoundNodesMap &Nodes) { + const DynTypedNode &BN = Nodes.getNode(this->BindingID); + if (const auto *ND = BN.get()) { + if (!isa(ND)) + return true; + return ND->getName() != Name; + } + return true; + }); +} + +AST_MATCHER_P(LambdaCapture, hasCaptureKind, LambdaCaptureKind, Kind) { + return Node.getCaptureKind() == Kind; +} + +AST_MATCHER_P(LambdaExpr, hasCaptureDefaultKind, LambdaCaptureDefault, Kind) { + return Node.getCaptureDefault() == Kind; +} + } // namespace void MissingStdForwardCheck::registerMatchers(MatchFinder *Finder) { + auto RefToParmImplicit = allOf( + equalsBoundNode("var"), hasInitializer(ignoringParenImpCasts( + declRefExpr(to(equalsBoundNode("param")))))); + auto RefToParm = capturesVar( + varDecl(anyOf(hasSameNameAsBoundNode("param"), RefToParmImplicit))); + auto HasRefToParm = hasAnyCapture(RefToParm); + + auto CaptureInRef = + allOf(hasCaptureDefaultKind(LambdaCaptureDefault::LCD_ByRef), + unless(hasAnyCapture( + capturesVar(varDecl(hasSameNameAsBoundNode("param")))))); + auto CaptureInCopy = allOf( + hasCaptureDefaultKind(LambdaCaptureDefault::LCD_ByCopy), HasRefToParm); + auto CaptureByRefExplicit = hasAnyCapture( + allOf(hasCaptureKind(LambdaCaptureKind::LCK_ByRef), RefToParm)); + + auto CapturedInBody = + lambdaExpr(anyOf(CaptureInRef, CaptureInCopy, CaptureByRefExplicit)); + auto CapturedInCaptureList = hasAnyCapture(capturesVar( + varDecl(hasInitializer(ignoringParenImpCasts(equalsBoundNode("call")))))); + + auto CapturedInLambda = hasDeclContext(cxxRecordDecl( + isLambda(), + hasParent(lambdaExpr(forCallable(equalsBoundNode("func")), + anyOf(CapturedInCaptureList, CapturedInBody))))); + auto ToParam = hasAnyParameter(parmVarDecl(equalsBoundNode("param"))); auto ForwardCallMatcher = callExpr( - forCallable(equalsBoundNode("func")), argumentCountIs(1), + callExpr().bind("call"), argumentCountIs(1), + hasArgument( + 0, declRefExpr(to( + varDecl(optionally(equalsBoundNode("param"))).bind("var")))), + forCallable(anyOf(equalsBoundNode("func"), CapturedInLambda)), callee(unresolvedLookupExpr(hasAnyDeclaration( namedDecl(hasUnderlyingDecl(hasName("::std::forward")))))), - hasArgument(0, declRefExpr(to(equalsBoundNode("param"))).bind("ref")), + unless(anyOf(hasAncestor(typeLoc()), hasAncestor(expr(hasUnevaluatedContext()))))); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 08ade30..1bd5a72 100644 --- 
a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -301,6 +301,10 @@ Changes in existing checks coroutine functions and increase issue detection for cases involving type aliases with references. +- Improved :doc:`cppcoreguidelines-missing-std-forward + ` check to + address false positives in the capture list and body of lambdas. + - Improved :doc:`cppcoreguidelines-narrowing-conversions ` check by extending the `IgnoreConversionFromTypes` option to include types without a diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp index b9720db..443f338 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp @@ -90,9 +90,9 @@ void lambda_value_capture(T&& t) { } template -void lambda_value_reference(T&& t) { - // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: forwarding reference parameter 't' is never forwarded inside the function body [cppcoreguidelines-missing-std-forward] - [&]() { T other = std::forward(t); }; +void lambda_value_capture_copy(T&& t) { + // CHECK-MESSAGES: :[[@LINE-1]]:36: warning: forwarding reference parameter 't' is never forwarded inside the function body [cppcoreguidelines-missing-std-forward] + [&,t]() { T other = std::forward(t); }; } } // namespace positive_cases @@ -147,4 +147,29 @@ class AClass { T data; }; +template +void lambda_value_reference(T&& t) { + [&]() { T other = std::forward(t); }; +} + +template +void lambda_value_reference_capture_list_ref_1(T&& t) { + [=, &t] { T other = std::forward(t); }; +} + +template +void lambda_value_reference_capture_list_ref_2(T&& t) { + [&t] { T other = std::forward(t); }; +} + +template +void lambda_value_reference_capture_list(T&& t) { + [t = std::forward(t)] { t(); }; +} + +template +void lambda_value_reference_auxiliary_var(T&& t) { + [&x = t]() { T other = std::forward(x); }; +} + } // namespace negative_cases -- cgit v1.1 From 3fb0d8dc808cb7f315670d76109edc9c57cb3d90 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 6 Jan 2024 12:08:03 +0000 Subject: Recommit "[VPlan] Mark Select VPInstructions as not having sideeffects." With #70253 landed, selects for reduction results are explicitly used by ComputeReductionResult and Selects can be marked as not having side-effects again. This reverts the revert commit 173032902c960d4d0d67b521d8c149553d8e8ba3. 
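For context, a rough sketch of why the side-effect bit matters (assumed names, not the actual VPlan classes): dead-recipe cleanup may only erase an unused recipe when it reports no side effects, which is what lets the now-dead select in the test below disappear.

```cpp
#include <vector>

// Hypothetical model of the cleanup rule: an unused recipe can be erased only
// if it is known to have no side effects.
struct Recipe {
  bool HasUsers = false;
  bool MayHaveSideEffects = true;
};

inline void removeDeadRecipes(std::vector<Recipe> &Body) {
  std::erase_if(Body, [](const Recipe &R) {
    return !R.HasUsers && !R.MayHaveSideEffects;
  });
}
```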
--- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + llvm/test/Transforms/LoopVectorize/reduction-small-size.ll | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1e5273b..c9def41 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -121,6 +121,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPInstructionSC: switch (cast(this)->getOpcode()) { case Instruction::ICmp: + case Instruction::Select: case VPInstruction::Not: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 3973a28..2a58748d 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -11,15 +11,12 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> @@ -40,7 +37,7 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: ; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef ; CHECK-NEXT: br label [[IF_END]] -- cgit v1.1 From 4b9bbd38686af3dbffd45b360bd5af629426bdbc Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sat, 6 Jan 2024 21:38:12 +0800 Subject: [X86][NFC] Refine code in X86CompressEVEXTablesEmitter.cpp 1. Simplify getValueFromBitsInit about cast and return type 2. Remove out-of-date comments and allow memory ops in function object `IsMatch` so that we can reuse it for EVEX2Legacy compression. This patch is to extract NFC in #77065 into a separate commit. 
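As a side note, here is a plain-C++ sketch of what the simplified helper computes (illustrative only; the real emitter reads a TableGen `BitsInit`): the opcode byte is assembled LSB-first.

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Illustrative stand-in for byteFromBitsInit: assemble a byte LSB-first,
// e.g. {1, 0, 1, 1} -> 0b1101 = 0x0D.
static uint8_t byteFromBits(std::initializer_list<int> Bits) {
  assert(Bits.size() <= 8 && "Field is too large for uint8_t!");
  uint8_t Value = 0;
  unsigned I = 0;
  for (int Bit : Bits)
    Value |= uint8_t(Bit != 0) << I++;
  return Value;
}
```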
--- .../TableGen/X86CompressEVEXTablesEmitter.cpp | 54 +++++++++------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index 0fcd0b0..82a9bfe 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -73,15 +73,14 @@ void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, OS << "};\n\n"; } -// Return true if the 2 BitsInits are equal -// Calculates the integer value residing BitsInit object -static inline uint64_t getValueFromBitsInit(const BitsInit *B) { - uint64_t Value = 0; - for (unsigned i = 0, e = B->getNumBits(); i != e; ++i) { - if (BitInit *Bit = dyn_cast(B->getBit(i))) - Value |= uint64_t(Bit->getValue()) << i; - else - PrintFatalError("Invalid VectSize bit"); +static uint8_t byteFromBitsInit(const BitsInit *B) { + unsigned N = B->getNumBits(); + assert(N <= 8 && "Field is too large for uint8_t!"); + + uint8_t Value = 0; + for (unsigned I = 0; I != N; ++I) { + BitInit *Bit = cast(B->getBit(I)); + Value |= Bit->getValue() << I; } return Value; } @@ -105,30 +104,23 @@ public: NewRI.Form)) return false; - // This is needed for instructions with intrinsic version (_Int). - // Where the only difference is the size of the operands. - // For example: VUCOMISDZrm and Int_VUCOMISDrm - // Also for instructions that their EVEX version was upgraded to work with - // k-registers. For example VPCMPEQBrm (xmm output register) and - // VPCMPEQBZ128rm (k register output register). - for (unsigned i = 0, e = OldInst->Operands.size(); i < e; i++) { - Record *OpRec1 = OldInst->Operands[i].Rec; - Record *OpRec2 = NewInst->Operands[i].Rec; - - if (OpRec1 == OpRec2) + for (unsigned I = 0, E = OldInst->Operands.size(); I < E; ++I) { + Record *OldOpRec = OldInst->Operands[I].Rec; + Record *NewOpRec = NewInst->Operands[I].Rec; + + if (OldOpRec == NewOpRec) continue; - if (isRegisterOperand(OpRec1) && isRegisterOperand(OpRec2)) { - if (getRegOperandSize(OpRec1) != getRegOperandSize(OpRec2)) + if (isRegisterOperand(OldOpRec) && isRegisterOperand(NewOpRec)) { + if (getRegOperandSize(OldOpRec) != getRegOperandSize(NewOpRec)) + return false; + } else if (isMemoryOperand(OldOpRec) && isMemoryOperand(NewOpRec)) { + if (getMemOperandSize(OldOpRec) != getMemOperandSize(NewOpRec)) return false; - } else if (isMemoryOperand(OpRec1) && isMemoryOperand(OpRec2)) { - return false; - } else if (isImmediateOperand(OpRec1) && isImmediateOperand(OpRec2)) { - if (OpRec1->getValueAsDef("Type") != OpRec2->getValueAsDef("Type")) { + } else if (isImmediateOperand(OldOpRec) && isImmediateOperand(NewOpRec)) { + if (OldOpRec->getValueAsDef("Type") != NewOpRec->getValueAsDef("Type")) return false; - } - } else - return false; + } } return true; @@ -164,8 +156,8 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { for (const CodeGenInstruction *Inst : PreCompressionInsts) { const Record *Rec = Inst->TheDef; - uint64_t Opcode = - getValueFromBitsInit(Inst->TheDef->getValueAsBitsInit("Opcode")); + uint8_t Opcode = + byteFromBitsInit(Inst->TheDef->getValueAsBitsInit("Opcode")); const CodeGenInstruction *NewInst = nullptr; if (ManualMap.find(Rec->getName()) != ManualMap.end()) { Record *NewRec = Records.getDef(ManualMap.at(Rec->getName())); -- cgit v1.1 From 8bbf100799a97f8342bf1a8409c6fb48f03e837f Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sat, 6 Jan 2024 22:03:03 +0800 Subject: [X86][NFC] Remove dead code 
for "_REV" instructions Those "_REV" instructions should not appear before encoding optimization, while macro fusion and flag-copy lowering are before encoding optimization. --- llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 16 ---------------- llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 1 - 2 files changed, 17 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index e006dd8..304b998 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -148,25 +148,21 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) { case X86::AND16ri8: case X86::AND16rm: case X86::AND16rr: - case X86::AND16rr_REV: case X86::AND32i32: case X86::AND32ri: case X86::AND32ri8: case X86::AND32rm: case X86::AND32rr: - case X86::AND32rr_REV: case X86::AND64i32: case X86::AND64ri32: case X86::AND64ri8: case X86::AND64rm: case X86::AND64rr: - case X86::AND64rr_REV: case X86::AND8i8: case X86::AND8ri: case X86::AND8ri8: case X86::AND8rm: case X86::AND8rr: - case X86::AND8rr_REV: return FirstMacroFusionInstKind::And; // CMP case X86::CMP16i16: @@ -175,28 +171,24 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) { case X86::CMP16ri8: case X86::CMP16rm: case X86::CMP16rr: - case X86::CMP16rr_REV: case X86::CMP32i32: case X86::CMP32mr: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP32rm: case X86::CMP32rr: - case X86::CMP32rr_REV: case X86::CMP64i32: case X86::CMP64mr: case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP64rm: case X86::CMP64rr: - case X86::CMP64rr_REV: case X86::CMP8i8: case X86::CMP8mr: case X86::CMP8ri: case X86::CMP8ri8: case X86::CMP8rm: case X86::CMP8rr: - case X86::CMP8rr_REV: return FirstMacroFusionInstKind::Cmp; // ADD case X86::ADD16i16: @@ -204,50 +196,42 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) { case X86::ADD16ri8: case X86::ADD16rm: case X86::ADD16rr: - case X86::ADD16rr_REV: case X86::ADD32i32: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD32rm: case X86::ADD32rr: - case X86::ADD32rr_REV: case X86::ADD64i32: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64rm: case X86::ADD64rr: - case X86::ADD64rr_REV: case X86::ADD8i8: case X86::ADD8ri: case X86::ADD8ri8: case X86::ADD8rm: case X86::ADD8rr: - case X86::ADD8rr_REV: // SUB case X86::SUB16i16: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB16rm: case X86::SUB16rr: - case X86::SUB16rr_REV: case X86::SUB32i32: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB32rm: case X86::SUB32rr: - case X86::SUB32rr_REV: case X86::SUB64i32: case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB64rm: case X86::SUB64rr: - case X86::SUB64rr_REV: case X86::SUB8i8: case X86::SUB8ri: case X86::SUB8ri8: case X86::SUB8rm: case X86::SUB8rr: - case X86::SUB8rr_REV: return FirstMacroFusionInstKind::AddSub; // INC case X86::INC16r: diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index b13bf36..aad839b 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -173,7 +173,6 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ - LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ case X86::MNEMONIC##8ri: \ -- cgit v1.1 From 0c7d46a7fd5b7956e285d385a6945153d6a06eb0 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Sat, 6 Jan 
2024 22:26:34 +0800
Subject: [Clang] Correctly construct template arguments for template template
 parameters (#76811)

This fixes the bug introduced by
https://github.com/llvm/llvm-project/commit/6db007a0654ed7a6ed5c3aa3b61a937c19a6bc6b.

We construct placeholder template arguments for template-template parameters
to avoid mismatching argument substitution, since they have different depths
from their corresponding template arguments. In this case,

```cpp
template