Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPU.td                     |    3
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp          |    4
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp       |    9
-rw-r--r--   llvm/lib/Target/AMDGPU/DSInstructions.td             |   20
-rw-r--r--   llvm/lib/Target/AMDGPU/FLATInstructions.td           |  130
-rw-r--r--   llvm/lib/Target/RISCV/RISCVISelLowering.cpp          |  134
-rw-r--r--   llvm/lib/Target/RISCV/RISCVInstrInfo.cpp             |   29
-rw-r--r--   llvm/lib/Target/RISCV/RISCVInstrInfo.h               |   19
-rw-r--r--   llvm/lib/Target/RISCV/RISCVInstrInfoZa.td            |    4
-rw-r--r--   llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp   |  191
10 files changed, 442 insertions, 101 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1a697f7..502a8e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2592,6 +2592,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
 // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
 // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
 
+def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+                                                "!Subtarget->d16PreservesUnusedBits()">;
+
 def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
 def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index cb49936..65d049e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1504,7 +1504,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
-    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
     CallingConv::ID CC = F->getCallingConv();
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
@@ -1515,6 +1514,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!F->isDeclaration() && ST.hasClusters())
       A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
 
+    if (ST.hasGFX90AInsts())
+      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+
     for (auto &I : instructions(F)) {
       Value *Ptr = nullptr;
       if (auto *LI = dyn_cast<LoadInst>(&I))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 280fbe2..723d07e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -929,8 +929,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         ThinOrFullLTOPhase Phase) {
         if (Level != OptimizationLevel::O0) {
           if (!isLTOPreLink(Phase)) {
-            AMDGPUAttributorOptions Opts;
-            MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+            if (getTargetTriple().isAMDGCN()) {
+              AMDGPUAttributorOptions Opts;
+              MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+            }
           }
         }
       });
@@ -1296,7 +1298,8 @@ void AMDGPUPassConfig::addIRPasses() {
   if (LowerCtorDtor)
     addPass(createAMDGPUCtorDtorLoweringLegacyPass());
 
-  if (isPassEnabled(EnableImageIntrinsicOptimizer))
+  if (TM.getTargetTriple().isAMDGCN() &&
+      isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
   // This can be disabled by passing ::Disable here or on the command line
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index b2ff5a1..18582ed 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -951,6 +951,11 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat
   (inst $ptr, Offset:$offset, (i1 gds))
 >;
 
+class DSReadPat_t16 <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
+  (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+  (EXTRACT_SUBREG (inst $ptr, Offset:$offset, (i1 gds)), lo16)
+>;
+
 multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
 
   let OtherPredicates = [LDSRequiresM0Init] in {
@@ -968,13 +973,14 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
     def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
   }
 
-  let OtherPredicates = [NotLDSRequiresM0Init] in {
-    let True16Predicate = NotUseRealTrue16Insts in {
-      def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
-    }
-    let True16Predicate = UseRealTrue16Insts in {
-      def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
-    }
+  let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = NotUseRealTrue16Insts in {
+    def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+  }
+  let OtherPredicates = [NotLDSRequiresM0Init, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
+    def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+  }
+  let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = UseTrue16WithSramECC in {
+    def : DSReadPat_t16<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 5a22b23..e86816d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1383,6 +1383,11 @@ class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
   (inst $vaddr, $offset, (i32 0))
 >;
 
+class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
+  (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
 class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
   (inst $vaddr, $offset, 0, $in)
@@ -1393,6 +1398,11 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
   (inst $vaddr, $offset, (i32 0))
 >;
 
+class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
+  (EXTRACT_SUBREG (inst $vaddr, $offset, (i32 0)), lo16)
+>;
+
 class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
   (inst $saddr, $voffset, $offset, $cpol, $in)
@@ -1408,6 +1418,11 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
   (inst $saddr, $voffset, $offset, $cpol)
 >;
 
+class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+  (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
 class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
   (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
   (inst $dsaddr, $vaddr, $offset, $cpol)
@@ -1443,6 +1458,11 @@ class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Valu
   (inst $saddr, $voffset, $offset, $cpol)
 >;
 
+class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+  (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
 class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
   (inst $vaddr, $offset)
@@ -1625,6 +1645,11 @@ class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
   (inst $vaddr, $offset, 0)
 >;
 
+class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+  (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
 class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)),
   (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
@@ -1645,6 +1670,11 @@ class ScratchLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Val
   (inst $saddr, $offset, 0)
 >;
 
+class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
+  (EXTRACT_SUBREG (inst $saddr, $offset), lo16)
+>;
+
 class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)),
@@ -1672,6 +1702,11 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
   (inst $vaddr, $saddr, $offset, $cpol)
 >;
 
+class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+  (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16)
+>;
+
 multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> {
   def : FlatLoadLDSSignedPat_M0 <inst, node> {
     let AddedComplexity = 10;
   }
@@ -1764,6 +1799,16 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value
   }
 }
 
+multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : FlatSignedLoadPat_t16<inst, node, vt> {
+    let AddedComplexity = 10;
+  }
+
+  def : GlobalLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 11;
+  }
+}
+
 multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
                                ValueType vt> {
   def : FlatStoreSignedPat <inst, node, vt> {
@@ -1872,8 +1917,8 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
   }
 }
 
-multiclass ScratchFLATStorePats_t16<string inst, SDPatternOperator node,
-                                    ValueType vt> {
+multiclass ScratchFLATStorePats_D16_t16<string inst, SDPatternOperator node,
+                                        ValueType vt> {
   def : ScratchStoreSignedPat <!cast<FLAT_Pseudo>(inst#"_t16"), node, vt> {
     let AddedComplexity = 25;
   }
@@ -1918,6 +1963,21 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
   }
 }
 
+multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat_t16 <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+
+  def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+    let SubtargetPredicate = HasFlatScratchSVSMode;
+    let AddedComplexity = 27;
+  }
+}
+
 multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
   def : FlatLoadPat <inst, node, vt> {
     let OtherPredicates = [HasFlatAddressSpace];
@@ -1947,6 +2007,17 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT
   }
 }
 
+multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : FlatLoadPat_t16 <inst, node, vt> {
+    let OtherPredicates = [HasFlatAddressSpace];
+  }
+
+  def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 9;
+    let SubtargetPredicate = HasFlatGVSMode;
+  }
+}
+
 multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
   def : FlatStorePat <inst, node, vt> {
     let OtherPredicates = [HasFlatAddressSpace];
@@ -1997,6 +2068,17 @@ let True16Predicate = NotUseRealTrue16Insts in {
   defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
 }
 
+let True16Predicate = UseTrue16WithSramECC in {
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+}
+
 let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
@@ -2006,11 +2088,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
   defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
   defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
   defm : FlatStorePats_t16 <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
   defm : FlatStorePats_t16 <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
-} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+}
 
 defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
@@ -2140,6 +2225,20 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
 }
 
+let True16Predicate = UseTrue16WithSramECC in {
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SSHORT, atomic_load_sext_16_global, i32>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
+}
+
 let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
@@ -2192,6 +2291,13 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
 }
 
+let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = UseRealTrue16Insts in {
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", store_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
+}
+
 let OtherPredicates = [HasD16LoadStore] in {
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
@@ -2362,14 +2468,24 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
 defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
 }
 
-let True16Predicate = UseRealTrue16Insts in {
+let True16Predicate = UseTrue16WithSramECC in {
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>;
+}
+
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
-} // End True16Predicate = UseRealTrue16Insts
+} // End OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
+defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
+}
 
 foreach vt = Reg32Types.types in {
 defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
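A note on the shared idea behind the *_t16 pattern classes added to DSInstructions.td and FLATInstructions.td above: on subtargets where D16 loads do not preserve the unused half of the 32-bit VGPR (the UseTrue16WithSramECC case), the patterns select the plain 32-bit extending load and then take the low 16-bit subregister (EXTRACT_SUBREG ..., lo16). That is only sound because a 32-bit sign/zero-extending load truncated to 16 bits equals the corresponding 16-bit extending load. A minimal standalone C++ check of that equivalence (plain integers standing in for VGPR lanes; illustrative, not LLVM API):

#include <cassert>
#include <cstdint>

// Model FLAT_LOAD_SBYTE / FLAT_LOAD_UBYTE: hardware extends the loaded
// byte to the full 32-bit VGPR.
uint32_t load_sbyte(int8_t mem) { return static_cast<uint32_t>(static_cast<int32_t>(mem)); }
uint32_t load_ubyte(uint8_t mem) { return static_cast<uint32_t>(mem); }

// Model EXTRACT_SUBREG ..., lo16: keep only the low half of the VGPR.
uint16_t lo16(uint32_t vgpr) { return static_cast<uint16_t>(vgpr); }

int main() {
  for (int v = -128; v < 128; ++v) {
    int8_t b = static_cast<int8_t>(v);
    // sextloadi8 to i16 == lo16 of a 32-bit sign-extending load.
    assert(lo16(load_sbyte(b)) == static_cast<uint16_t>(static_cast<int16_t>(b)));
    // zextloadi8 to i16 == lo16 of a 32-bit zero-extending load.
    assert(lo16(load_ubyte(static_cast<uint8_t>(b))) ==
           static_cast<uint16_t>(static_cast<uint8_t>(b)));
  }
  return 0;
}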
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6234714..a3a4cf2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16498,43 +16498,60 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   SDValue X = N->getOperand(0);
 
   if (Subtarget.hasShlAdd(3)) {
-    for (uint64_t Divisor : {3, 5, 9}) {
-      if (MulAmt % Divisor != 0)
-        continue;
-      uint64_t MulAmt2 = MulAmt / Divisor;
-      // 3/5/9 * 2^N -> shl (shXadd X, X), N
-      if (isPowerOf2_64(MulAmt2)) {
-        SDLoc DL(N);
-        SDValue X = N->getOperand(0);
-        // Put the shift first if we can fold a zext into the
-        // shift forming a slli.uw.
-        if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
-            X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
-          SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
-                                    DAG.getConstant(Log2_64(MulAmt2), DL, VT));
-          return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
-                             DAG.getConstant(Log2_64(Divisor - 1), DL, VT),
-                             Shl);
-        }
-        // Otherwise, put rhe shl second so that it can fold with following
-        // instructions (e.g. sext or add).
-        SDValue Mul359 =
-            DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                        DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
-        return DAG.getNode(ISD::SHL, DL, VT, Mul359,
-                           DAG.getConstant(Log2_64(MulAmt2), DL, VT));
-      }
-
-      // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
-      if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
-        SDLoc DL(N);
-        SDValue Mul359 =
-            DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                        DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
-        return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
-                           DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT),
-                           Mul359);
+    int Shift;
+    if (int ShXAmount = isShifted359(MulAmt, Shift)) {
+      // 3/5/9 * 2^N -> shl (shXadd X, X), N
+      SDLoc DL(N);
+      SDValue X = N->getOperand(0);
+      // Put the shift first if we can fold a zext into the shift forming
+      // a slli.uw.
+      if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
+          X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
+        SDValue Shl =
+            DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT));
+        return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
+                           DAG.getConstant(ShXAmount, DL, VT), Shl);
       }
+      // Otherwise, put the shl second so that it can fold with following
+      // instructions (e.g. sext or add).
+      SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+                                   DAG.getConstant(ShXAmount, DL, VT), X);
+      return DAG.getNode(ISD::SHL, DL, VT, Mul359,
+                         DAG.getConstant(Shift, DL, VT));
+    }
+
+    // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
+    int ShX;
+    int ShY;
+    switch (MulAmt) {
+    case 3 * 5:
+      ShY = 1;
+      ShX = 2;
+      break;
+    case 3 * 9:
+      ShY = 1;
+      ShX = 3;
+      break;
+    case 5 * 5:
+      ShX = ShY = 2;
+      break;
+    case 5 * 9:
+      ShY = 2;
+      ShX = 3;
+      break;
+    case 9 * 9:
+      ShX = ShY = 3;
+      break;
+    default:
+      ShX = ShY = 0;
+      break;
+    }
+    if (ShX) {
+      SDLoc DL(N);
+      SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+                                   DAG.getConstant(ShY, DL, VT), X);
+      return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
+                         DAG.getConstant(ShX, DL, VT), Mul359);
     }
 
     // If this is a power 2 + 2/4/8, we can use a shift followed by a single
@@ -16557,18 +16574,14 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     // variants we could implement.  e.g.
     //   (2^(1,2,3) * 3,5,9 + 1) << C2
     //   2^(C1>3) * 3,5,9 +/- 1
-    for (uint64_t Divisor : {3, 5, 9}) {
-      uint64_t C = MulAmt - 1;
-      if (C <= Divisor)
-        continue;
-      unsigned TZ = llvm::countr_zero(C);
-      if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
+    if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
+      assert(Shift != 0 && "MulAmt=4,6,10 handled before");
+      if (Shift <= 3) {
         SDLoc DL(N);
-        SDValue Mul359 =
-            DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                        DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+        SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+                                     DAG.getConstant(ShXAmount, DL, VT), X);
         return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
-                           DAG.getConstant(TZ, DL, VT), X);
+                           DAG.getConstant(Shift, DL, VT), X);
       }
     }
 
@@ -16576,7 +16589,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
     unsigned ScaleShift = llvm::countr_zero(MulAmt - 1);
     if (ScaleShift >= 1 && ScaleShift < 4) {
-      unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
+      unsigned ShiftAmt = llvm::countr_zero((MulAmt - 1) & (MulAmt - 2));
       SDLoc DL(N);
       SDValue Shift1 =
           DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
@@ -16589,7 +16602,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
   for (uint64_t Offset : {3, 5, 9}) {
     if (isPowerOf2_64(MulAmt + Offset)) {
-      unsigned ShAmt = Log2_64(MulAmt + Offset);
+      unsigned ShAmt = llvm::countr_zero(MulAmt + Offset);
       if (ShAmt >= VT.getSizeInBits())
         continue;
       SDLoc DL(N);
@@ -16608,21 +16621,16 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
       uint64_t MulAmt2 = MulAmt / Divisor;
       // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
       // of 25 which happen to be quite common.
-      for (uint64_t Divisor2 : {3, 5, 9}) {
-        if (MulAmt2 % Divisor2 != 0)
-          continue;
-        uint64_t MulAmt3 = MulAmt2 / Divisor2;
-        if (isPowerOf2_64(MulAmt3)) {
-          SDLoc DL(N);
-          SDValue Mul359A =
-              DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                          DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
-          SDValue Mul359B = DAG.getNode(
-              RISCVISD::SHL_ADD, DL, VT, Mul359A,
-              DAG.getConstant(Log2_64(Divisor2 - 1), DL, VT), Mul359A);
-          return DAG.getNode(ISD::SHL, DL, VT, Mul359B,
-                             DAG.getConstant(Log2_64(MulAmt3), DL, VT));
-        }
+      if (int ShBAmount = isShifted359(MulAmt2, Shift)) {
+        SDLoc DL(N);
+        SDValue Mul359A =
+            DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+                        DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+        SDValue Mul359B =
+            DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A,
+                        DAG.getConstant(ShBAmount, DL, VT), Mul359A);
+        return DAG.getNode(ISD::SHL, DL, VT, Mul359B,
+                           DAG.getConstant(Shift, DL, VT));
       }
     }
   }
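To see what the rewritten expandMul paths emit, here is a small standalone sketch of the same decomposition arithmetic with plain integers (not SelectionDAG; isShifted359 is re-declared locally just for the demo, mirroring the helper added to RISCVInstrInfo.h below):

#include <bit>
#include <cassert>
#include <cstdint>

// Same contract as the new RISCVInstrInfo.h helper: for Value == C1 << C2
// with C1 in {3, 5, 9}, return log2(C1 - 1) and set Shift = C2; else return 0.
int isShifted359(uint64_t Value, int &Shift) {
  if (Value == 0)
    return 0;
  Shift = std::countr_zero(Value);
  switch (Value >> Shift) {
  case 3: return 1;
  case 5: return 2;
  case 9: return 3;
  default: return 0;
  }
}

int main() {
  int Shift;
  // 20 = 5 << 2: one sh2add (x + (x << 2)) followed by slli by 2.
  assert(isShifted359(20, Shift) == 2 && Shift == 2);
  uint64_t x = 123;
  assert((((x << 2) + x) << 2) == 20 * x);
  // 45 = 5 * 9: shXadd (shYadd x, x), (shYadd x, x) with ShY = 2, ShX = 3.
  uint64_t m5 = (x << 2) + x;        // sh2add x, x -> 5x
  assert(((m5 << 3) + m5) == 45 * x); // sh3add m5, m5 -> 45x
  return 0;
}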
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7db4832..96e1078 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4586,24 +4586,23 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB,
         .addReg(DestReg, RegState::Kill)
         .addImm(ShiftAmount)
         .setMIFlag(Flag);
-  } else if (STI.hasShlAdd(3) &&
-             ((Amount % 3 == 0 && isPowerOf2_64(Amount / 3)) ||
-              (Amount % 5 == 0 && isPowerOf2_64(Amount / 5)) ||
-              (Amount % 9 == 0 && isPowerOf2_64(Amount / 9)))) {
+  } else if (int ShXAmount, ShiftAmount;
+             STI.hasShlAdd(3) &&
+                 (ShXAmount = isShifted359(Amount, ShiftAmount)) != 0) {
     // We can use Zba SHXADD+SLLI instructions for multiply in some cases.
     unsigned Opc;
-    uint32_t ShiftAmount;
-    if (Amount % 9 == 0) {
-      Opc = RISCV::SH3ADD;
-      ShiftAmount = Log2_64(Amount / 9);
-    } else if (Amount % 5 == 0) {
-      Opc = RISCV::SH2ADD;
-      ShiftAmount = Log2_64(Amount / 5);
-    } else if (Amount % 3 == 0) {
+    switch (ShXAmount) {
+    case 1:
       Opc = RISCV::SH1ADD;
-      ShiftAmount = Log2_64(Amount / 3);
-    } else {
-      llvm_unreachable("implied by if-clause");
+      break;
+    case 2:
+      Opc = RISCV::SH2ADD;
+      break;
+    case 3:
+      Opc = RISCV::SH3ADD;
+      break;
+    default:
+      llvm_unreachable("unexpected result of isShifted359");
     }
     if (ShiftAmount)
       BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 42a0c4c..c5eddb9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -25,6 +25,25 @@
 
 namespace llvm {
 
+// If Value is of the form C1<<C2, where C1 = 3, 5 or 9,
+// returns log2(C1 - 1) and assigns Shift = C2.
+// Otherwise, returns 0.
+template <typename T> int isShifted359(T Value, int &Shift) {
+  if (Value == 0)
+    return 0;
+  Shift = llvm::countr_zero(Value);
+  switch (Value >> Shift) {
+  case 3:
+    return 1;
+  case 5:
+    return 2;
+  case 9:
+    return 3;
+  default:
+    return 0;
+  }
+}
+
 class RISCVSubtarget;
 
 static const MachineMemOperand::Flags MONontemporalBit0 =
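mulImm now maps the helper's return value directly to a Zba opcode: 1 -> SH1ADD, 2 -> SH2ADD, 3 -> SH3ADD, with a trailing SLLI when the power-of-two factor is nonzero. A plain-arithmetic sketch of the emitted sequence (hypothetical helper name; not MC-layer code):

#include <cassert>
#include <cstdint>

// Multiply by Amount = C1 << C2 (C1 in {3, 5, 9}) the way the patched
// mulImm does: one shNadd (x + (x << N)) followed by an slli when C2 != 0.
uint64_t mulViaShXAdd(uint64_t X, int ShXAmount, int Shift) {
  uint64_t ShXAdd = X + (X << ShXAmount); // SH1ADD/SH2ADD/SH3ADD for N = 1/2/3
  return ShXAdd << Shift;                 // SLLI, skipped when Shift == 0
}

int main() {
  // Amount = 72 = 9 << 3: isShifted359 returns 3 (-> SH3ADD), Shift = 3.
  assert(mulViaShXAdd(7, 3, 3) == 7 * 72);
  // Amount = 5 = 5 << 0: SH2ADD alone, no SLLI.
  assert(mulViaShXAdd(7, 2, 0) == 7 * 5);
  return 0;
}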
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
index 7cf6d5f..87b9f45 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
@@ -9,8 +9,8 @@
 // This file describes the RISC-V instructions from the standard atomic 'Za*'
 // extensions:
 //   - Zawrs (v1.0) : Wait-on-Reservation-Set.
-//   - Zacas (v1.0-rc1) : Atomic Compare-and-Swap.
-//   - Zabha (v1.0-rc1) : Byte and Halfword Atomic Memory Operations.
+//   - Zacas (v1.0) : Atomic Compare-and-Swap.
+//   - Zabha (v1.0) : Byte and Halfword Atomic Memory Operations.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ee25f69..7bc0b5b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2747,20 +2747,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   Intrinsic::ID IID = Inst->getIntrinsicID();
   LLVMContext &C = Inst->getContext();
   bool HasMask = false;
+
+  auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
+                      bool IsWrite) -> int64_t {
+    if (auto *TarExtTy =
+            dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
+      return TarExtTy->getIntParameter(0);
+
+    return 1;
+  };
+
   switch (IID) {
   case Intrinsic::riscv_vle_mask:
   case Intrinsic::riscv_vse_mask:
+  case Intrinsic::riscv_vlseg2_mask:
+  case Intrinsic::riscv_vlseg3_mask:
+  case Intrinsic::riscv_vlseg4_mask:
+  case Intrinsic::riscv_vlseg5_mask:
+  case Intrinsic::riscv_vlseg6_mask:
+  case Intrinsic::riscv_vlseg7_mask:
+  case Intrinsic::riscv_vlseg8_mask:
+  case Intrinsic::riscv_vsseg2_mask:
+  case Intrinsic::riscv_vsseg3_mask:
+  case Intrinsic::riscv_vsseg4_mask:
+  case Intrinsic::riscv_vsseg5_mask:
+  case Intrinsic::riscv_vsseg6_mask:
+  case Intrinsic::riscv_vsseg7_mask:
+  case Intrinsic::riscv_vsseg8_mask:
     HasMask = true;
     [[fallthrough]];
   case Intrinsic::riscv_vle:
-  case Intrinsic::riscv_vse: {
+  case Intrinsic::riscv_vse:
+  case Intrinsic::riscv_vlseg2:
+  case Intrinsic::riscv_vlseg3:
+  case Intrinsic::riscv_vlseg4:
+  case Intrinsic::riscv_vlseg5:
+  case Intrinsic::riscv_vlseg6:
+  case Intrinsic::riscv_vlseg7:
+  case Intrinsic::riscv_vlseg8:
+  case Intrinsic::riscv_vsseg2:
+  case Intrinsic::riscv_vsseg3:
+  case Intrinsic::riscv_vsseg4:
+  case Intrinsic::riscv_vsseg5:
+  case Intrinsic::riscv_vsseg6:
+  case Intrinsic::riscv_vsseg7:
+  case Intrinsic::riscv_vsseg8: {
     // Intrinsic interface:
     // riscv_vle(merge, ptr, vl)
     // riscv_vle_mask(merge, ptr, mask, vl, policy)
     // riscv_vse(val, ptr, vl)
     // riscv_vse_mask(val, ptr, mask, vl, policy)
+    // riscv_vlseg#(merge, ptr, vl, sew)
+    // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
+    // riscv_vsseg#(val, ptr, vl, sew)
+    // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
     bool IsWrite = Inst->getType()->isVoidTy();
     Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
+    // The results of segment loads are TargetExtType.
+    if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
+      unsigned SEW =
+          1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
+                   ->getZExtValue();
+      Ty = TarExtTy->getTypeParameter(0U);
+      Ty = ScalableVectorType::get(
+          IntegerType::get(C, SEW),
+          cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
+    }
     const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
     unsigned VLIndex = RVVIInfo->VLOperand;
     unsigned PtrOperandNo = VLIndex - 1 - HasMask;
@@ -2771,23 +2823,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
     if (HasMask)
       Mask = Inst->getArgOperand(VLIndex - 1);
     Value *EVL = Inst->getArgOperand(VLIndex);
+    unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
+    // RVV uses contiguous elements as a segment.
+    if (SegNum > 1) {
+      unsigned ElemSize = Ty->getScalarSizeInBits();
+      auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
+      Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
+    }
     Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
                                           Alignment, Mask, EVL);
     return true;
   }
   case Intrinsic::riscv_vlse_mask:
   case Intrinsic::riscv_vsse_mask:
+  case Intrinsic::riscv_vlsseg2_mask:
+  case Intrinsic::riscv_vlsseg3_mask:
+  case Intrinsic::riscv_vlsseg4_mask:
+  case Intrinsic::riscv_vlsseg5_mask:
+  case Intrinsic::riscv_vlsseg6_mask:
+  case Intrinsic::riscv_vlsseg7_mask:
+  case Intrinsic::riscv_vlsseg8_mask:
+  case Intrinsic::riscv_vssseg2_mask:
+  case Intrinsic::riscv_vssseg3_mask:
+  case Intrinsic::riscv_vssseg4_mask:
+  case Intrinsic::riscv_vssseg5_mask:
+  case Intrinsic::riscv_vssseg6_mask:
+  case Intrinsic::riscv_vssseg7_mask:
+  case Intrinsic::riscv_vssseg8_mask:
     HasMask = true;
     [[fallthrough]];
   case Intrinsic::riscv_vlse:
-  case Intrinsic::riscv_vsse: {
+  case Intrinsic::riscv_vsse:
+  case Intrinsic::riscv_vlsseg2:
+  case Intrinsic::riscv_vlsseg3:
+  case Intrinsic::riscv_vlsseg4:
+  case Intrinsic::riscv_vlsseg5:
+  case Intrinsic::riscv_vlsseg6:
+  case Intrinsic::riscv_vlsseg7:
+  case Intrinsic::riscv_vlsseg8:
+  case Intrinsic::riscv_vssseg2:
+  case Intrinsic::riscv_vssseg3:
+  case Intrinsic::riscv_vssseg4:
+  case Intrinsic::riscv_vssseg5:
+  case Intrinsic::riscv_vssseg6:
+  case Intrinsic::riscv_vssseg7:
+  case Intrinsic::riscv_vssseg8: {
     // Intrinsic interface:
     // riscv_vlse(merge, ptr, stride, vl)
     // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
     // riscv_vsse(val, ptr, stride, vl)
     // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
+    // riscv_vlsseg#(merge, ptr, offset, vl, sew)
+    // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
+    // riscv_vssseg#(val, ptr, offset, vl, sew)
+    // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
     bool IsWrite = Inst->getType()->isVoidTy();
     Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
+    // The results of segment loads are TargetExtType.
+    if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
+      unsigned SEW =
+          1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
+                   ->getZExtValue();
+      Ty = TarExtTy->getTypeParameter(0U);
+      Ty = ScalableVectorType::get(
+          IntegerType::get(C, SEW),
+          cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
+    }
     const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
     unsigned VLIndex = RVVIInfo->VLOperand;
     unsigned PtrOperandNo = VLIndex - 2 - HasMask;
@@ -2809,6 +2910,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
     if (HasMask)
       Mask = Inst->getArgOperand(VLIndex - 1);
     Value *EVL = Inst->getArgOperand(VLIndex);
+    unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
+    // RVV uses contiguous elements as a segment.
+    if (SegNum > 1) {
+      unsigned ElemSize = Ty->getScalarSizeInBits();
+      auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
+      Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
+    }
     Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
                                           Alignment, Mask, EVL, Stride);
     return true;
   }
@@ -2817,19 +2925,89 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   case Intrinsic::riscv_vloxei_mask:
   case Intrinsic::riscv_vluxei_mask:
   case Intrinsic::riscv_vsoxei_mask:
   case Intrinsic::riscv_vsuxei_mask:
+  case Intrinsic::riscv_vloxseg2_mask:
+  case Intrinsic::riscv_vloxseg3_mask:
+  case Intrinsic::riscv_vloxseg4_mask:
+  case Intrinsic::riscv_vloxseg5_mask:
+  case Intrinsic::riscv_vloxseg6_mask:
+  case Intrinsic::riscv_vloxseg7_mask:
+  case Intrinsic::riscv_vloxseg8_mask:
+  case Intrinsic::riscv_vluxseg2_mask:
+  case Intrinsic::riscv_vluxseg3_mask:
+  case Intrinsic::riscv_vluxseg4_mask:
+  case Intrinsic::riscv_vluxseg5_mask:
+  case Intrinsic::riscv_vluxseg6_mask:
+  case Intrinsic::riscv_vluxseg7_mask:
+  case Intrinsic::riscv_vluxseg8_mask:
+  case Intrinsic::riscv_vsoxseg2_mask:
+  case Intrinsic::riscv_vsoxseg3_mask:
+  case Intrinsic::riscv_vsoxseg4_mask:
+  case Intrinsic::riscv_vsoxseg5_mask:
+  case Intrinsic::riscv_vsoxseg6_mask:
+  case Intrinsic::riscv_vsoxseg7_mask:
+  case Intrinsic::riscv_vsoxseg8_mask:
+  case Intrinsic::riscv_vsuxseg2_mask:
+  case Intrinsic::riscv_vsuxseg3_mask:
+  case Intrinsic::riscv_vsuxseg4_mask:
+  case Intrinsic::riscv_vsuxseg5_mask:
+  case Intrinsic::riscv_vsuxseg6_mask:
+  case Intrinsic::riscv_vsuxseg7_mask:
+  case Intrinsic::riscv_vsuxseg8_mask:
     HasMask = true;
     [[fallthrough]];
   case Intrinsic::riscv_vloxei:
   case Intrinsic::riscv_vluxei:
   case Intrinsic::riscv_vsoxei:
-  case Intrinsic::riscv_vsuxei: {
+  case Intrinsic::riscv_vsuxei:
+  case Intrinsic::riscv_vloxseg2:
+  case Intrinsic::riscv_vloxseg3:
+  case Intrinsic::riscv_vloxseg4:
+  case Intrinsic::riscv_vloxseg5:
+  case Intrinsic::riscv_vloxseg6:
+  case Intrinsic::riscv_vloxseg7:
+  case Intrinsic::riscv_vloxseg8:
+  case Intrinsic::riscv_vluxseg2:
+  case Intrinsic::riscv_vluxseg3:
+  case Intrinsic::riscv_vluxseg4:
+  case Intrinsic::riscv_vluxseg5:
+  case Intrinsic::riscv_vluxseg6:
+  case Intrinsic::riscv_vluxseg7:
+  case Intrinsic::riscv_vluxseg8:
+  case Intrinsic::riscv_vsoxseg2:
+  case Intrinsic::riscv_vsoxseg3:
+  case Intrinsic::riscv_vsoxseg4:
+  case Intrinsic::riscv_vsoxseg5:
+  case Intrinsic::riscv_vsoxseg6:
+  case Intrinsic::riscv_vsoxseg7:
+  case Intrinsic::riscv_vsoxseg8:
+  case Intrinsic::riscv_vsuxseg2:
+  case Intrinsic::riscv_vsuxseg3:
+  case Intrinsic::riscv_vsuxseg4:
+  case Intrinsic::riscv_vsuxseg5:
+  case Intrinsic::riscv_vsuxseg6:
+  case Intrinsic::riscv_vsuxseg7:
+  case Intrinsic::riscv_vsuxseg8: {
     // Intrinsic interface (only listed ordered version):
     // riscv_vloxei(merge, ptr, index, vl)
     // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
     // riscv_vsoxei(val, ptr, index, vl)
     // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
+    // riscv_vloxseg#(merge, ptr, index, vl, sew)
+    // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
+    // riscv_vsoxseg#(val, ptr, index, vl, sew)
+    // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
     bool IsWrite = Inst->getType()->isVoidTy();
     Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
+    // The results of segment loads are TargetExtType.
+    if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
+      unsigned SEW =
+          1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
+                   ->getZExtValue();
+      Ty = TarExtTy->getTypeParameter(0U);
+      Ty = ScalableVectorType::get(
+          IntegerType::get(C, SEW),
+          cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
+    }
     const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
     unsigned VLIndex = RVVIInfo->VLOperand;
     unsigned PtrOperandNo = VLIndex - 2 - HasMask;
@@ -2845,6 +3023,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
       Mask = ConstantInt::getTrue(MaskType);
     }
     Value *EVL = Inst->getArgOperand(VLIndex);
+    unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
+    // RVV uses contiguous elements as a segment.
+    if (SegNum > 1) {
+      unsigned ElemSize = Ty->getScalarSizeInBits();
+      auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
+      Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
+    }
     Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
     Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
                                           Align(1), Mask, EVL, OffsetOp);
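The three TargetExtType blocks in the RISCVTargetTransformInfo.cpp change all perform the same reconstruction: a segment intrinsic's riscv.vector.tuple type carries a raw <vscale x N x i8> payload plus an NFIELDS integer parameter, and the trailing sew operand holds log2(SEW). A standalone C++ sketch of that arithmetic with concrete numbers (no LLVM types; values chosen for illustration):

#include <cassert>

int main() {
  // Say the intrinsic is riscv.vlseg4 on a tuple whose payload is
  // <vscale x 16 x i8> and whose last operand is log2(SEW) = 4.
  unsigned MinNumI8Elems = 16; // minimum element count of the i8 payload
  unsigned SegNum = 4;         // NFIELDS, the tuple's integer parameter

  unsigned SEW = 1u << 4; // 16-bit elements
  // Payload bits re-divided into SEW-sized elements:
  unsigned MinNumElems = MinNumI8Elems * 8 / SEW;
  assert(MinNumElems == 8); // i.e. <vscale x 8 x i16>

  // For the InterestingOperands entry, SegNum contiguous elements are
  // grouped into one wide element of ElemSize * SegNum bits.
  unsigned ElemSize = SEW;
  unsigned SegElemBits = ElemSize * SegNum;
  assert(SegElemBits == 64); // i.e. <vscale x 8 x i64>
  return 0;
}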