author     Philip Reames <preames@rivosinc.com>     2025-07-29 07:52:39 -0700
committer  GitHub <noreply@github.com>              2025-07-29 07:52:39 -0700
commit     73245b06b3da19ef70e04cf0f0a0d0df1ba82a57 (patch)
tree       dff56cdf7b800409dbb761dfe534e83423dc0f72
parent     fa6965f722e0573f62e4d1e533dfa5b3a2ce2c4f (diff)
[RISCV] Rewrite deinterleave load as vlse optimization as DAG combine (#150049)
This reworks an existing optimization on the fixed-vector (shuffle-based)
deinterleave lowering into a DAG combine. As a result, it kicks in much more
widely: in particular, it now applies on the deinterleave intrinsic (i.e.
scalable) path and on the deinterleaveN (without load) lowering, as well as
on the intrinsic lowering paths.
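
As a concrete illustration, here is a minimal sketch of the kind of input that now gets the strided-load lowering on the scalable path; it is distilled from the vector_deinterleave_load_factor4_oneactive case in the vector-deinterleave-load.ll diff below (the function name here is only illustrative). Only one field of the factor-4 deinterleave is live, so on subtargets without optimized segment load/store the access is selected as a single strided load with stride = factor * element size instead of a segment load:

; Only field 0 of the factor-4 deinterleave is used, so this now selects to
; li a1, 4 / vlse8.v v8, (a0), a1 rather than vlseg4e8.v v8, (a0).
define <vscale x 8 x i8> @deinterleave4_one_field(ptr %p) {
  %vec = load <vscale x 32 x i8>, ptr %p
  %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
  %ext = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
  ret <vscale x 8 x i8> %ext
}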
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp                 | 56
 llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp            | 23
 llvm/test/CodeGen/RISCV/rvv/pr141907.ll                     | 18
 llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll     | 11
 llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll          | 11
 llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll |  8
6 files changed, 86 insertions, 41 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 607edd3..43e4f8e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20843,6 +20843,62 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     }
     break;
   }
+  case RISCVISD::TUPLE_EXTRACT: {
+    EVT VT = N->getValueType(0);
+    SDValue Tuple = N->getOperand(0);
+    unsigned Idx = N->getConstantOperandVal(1);
+    if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
+      break;
+
+    unsigned NF = 0;
+    switch (Tuple.getConstantOperandVal(1)) {
+    default:
+      break;
+    case Intrinsic::riscv_vlseg2_mask:
+    case Intrinsic::riscv_vlseg3_mask:
+    case Intrinsic::riscv_vlseg4_mask:
+    case Intrinsic::riscv_vlseg5_mask:
+    case Intrinsic::riscv_vlseg6_mask:
+    case Intrinsic::riscv_vlseg7_mask:
+    case Intrinsic::riscv_vlseg8_mask:
+      NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+      break;
+    }
+
+    if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
+      break;
+
+    unsigned SEW = VT.getScalarSizeInBits();
+    assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
+           "Type mismatch without bitcast?");
+    unsigned Stride = SEW / 8 * NF;
+    unsigned Offset = SEW / 8 * Idx;
+
+    SDValue Ops[] = {
+        /*Chain=*/Tuple.getOperand(0),
+        /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
+        /*Passthru=*/Tuple.getOperand(2),
+        /*Ptr=*/
+        DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
+                    DAG.getConstant(Offset, DL, XLenVT)),
+        /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+        /*Mask=*/Tuple.getOperand(4),
+        /*VL=*/Tuple.getOperand(5),
+        /*Policy=*/Tuple.getOperand(6)};
+
+    auto TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
+    // Match getTgtMemIntrinsic for non-unit stride case
+    EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+    SDVTList VTs = DAG.getVTList({VT, MVT::Other});
+    SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
+                                             Ops, MemVT, MMO);
+    DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
+    return Result.getValue(0);
+  }
   }

   return SDValue();
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 3cbe668..17e2f01 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
   if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
     return false;

-  // If the segment load is going to be performed segment at a time anyways
-  // and there's only one element used, use a strided load instead.  This
-  // will be equally fast, and create less vector register pressure.
-  if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
-    unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
-    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
-    Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
-    // For rv64, need to truncate i64 to i32 to match signature. As VL is at most
-    // the number of active lanes (which is bounded by i32) this is safe.
-    VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
-    CallInst *CI =
-        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
-                                {VTy, BasePtr->getType(), Stride->getType()},
-                                {BasePtr, Stride, Mask, VL});
-    Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
-    CI->addParamAttr(0,
-                     Attribute::getWithAlignment(CI->getContext(), Alignment));
-    Shuffles[0]->replaceAllUsesWith(CI);
-    return true;
-  };
-
   CallInst *VlsegN = Builder.CreateIntrinsic(
       FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});

diff --git a/llvm/test/CodeGen/RISCV/rvv/pr141907.ll b/llvm/test/CodeGen/RISCV/rvv/pr141907.ll
index 648b47d..f93f88a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr141907.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr141907.ll
@@ -9,27 +9,29 @@ define void @pr141907(ptr %0) nounwind {
 ; CHECK-NEXT: slli a1, a1, 2
 ; CHECK-NEXT: sub sp, sp, a1
 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 0
+; CHECK-NEXT: vmv.v.i v8, 0
 ; CHECK-NEXT: vmclr.m v0
 ; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
 ; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: addi a3, sp, 20
+; CHECK-NEXT: li a4, 12
 ; CHECK-NEXT: .LBB0_1: # %vector.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vs4r.v v8, (a2)
 ; CHECK-NEXT: vsetvli a1, a1, e8, mf8, ta, ma
 ; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v11, v9, 0, v0.t
-; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v8, (a2)
+; CHECK-NEXT: vnsrl.wi v9, v8, 0, v0.t
+; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a3), a4
 ; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg2e16.v v11, (zero)
+; CHECK-NEXT: vsseg2e16.v v9, (zero)
 ; CHECK-NEXT: bnez a1, .LBB0_1
 ; CHECK-NEXT: .LBB0_2: # %while.body5
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v9, (a0)
+; CHECK-NEXT: vse16.v v8, (a0)
 ; CHECK-NEXT: j .LBB0_2
 entry:
   br label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index fba592d..c4284bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -407,8 +407,9 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
 define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), a1
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -419,8 +420,10 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
 define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v5, (a0)
+; CHECK-NEXT: addi a0, a0, 3
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), a1
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 5b1746d..ac9f263 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -3712,8 +3712,9 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive(<vsca
 ; CHECK-NEXT: sub sp, sp, a0
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v8, (a0)
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a0), a1
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 2
 ; CHECK-NEXT: add sp, sp, a0
@@ -3732,9 +3733,11 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive2(<vsc
 ; CHECK-NEXT: slli a0, a0, 2
 ; CHECK-NEXT: sub sp, sp, a0
 ; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: addi a1, sp, 36
 ; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v3, (a0)
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a1), a0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 2
 ; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 23c0c82..2afb72f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -674,16 +674,20 @@ define <vscale x 2 x i32> @load_factor2_oneactive(ptr %ptr, i32 %evl) {
 define <vscale x 2 x i32> @load_factor5_oneactive(ptr %ptr, i32 %evl) {
 ; RV32-LABEL: load_factor5_oneactive:
 ; RV32: # %bb.0:
+; RV32-NEXT: addi a0, a0, 12
+; RV32-NEXT: li a2, 20
 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; RV32-NEXT: vlseg5e32.v v5, (a0)
+; RV32-NEXT: vlse32.v v8, (a0), a2
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: load_factor5_oneactive:
 ; RV64: # %bb.0:
 ; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: addi a0, a0, 12
 ; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: li a2, 20
 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; RV64-NEXT: vlseg5e32.v v5, (a0)
+; RV64-NEXT: vlse32.v v8, (a0), a2
 ; RV64-NEXT: ret
   %rvl = mul nuw i32 %evl, 5
   %wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)