author     Philip Reames <preames@rivosinc.com>       2025-07-22 15:50:17 -0700
committer  GitHub <noreply@github.com>                2025-07-22 15:50:17 -0700
commit     dbd9eae95a5ad113d4559c07839306bf68f1925c (patch)
tree       8e1f2e23009e73f5e6ce31f0d5c0dbd03fbe3211 /llvm/lib
parent     5edb845fcaafd56110670fb2f6eeac22cbb0afff (diff)
[IA] Support vp.store in lowerInterleavedStore (#149605)
Follow-up to 28417e64 and the whole line of work started with 4b81dc7.
This change merges the handling for VPStore - currently in
lowerInterleavedVPStore - into the existing routine used in the shuffle
lowering path, lowerInterleavedStore. This removes the last use of
lowerInterleavedVPStore, so the dedicated hook can now be deleted.
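To make the shape of the change concrete, here is a hand-written sketch
(illustrative IR only, not taken from the patch or its tests) of the kind of
pattern that now flows through the shared lowerInterleavedStore hook, for
Factor = 2:

  ; %a/%b are the two fields being interleaved; %m and %evl are the mask and
  ; EVL of the original vp.store (names are made up for this example).
  %interleaved = shufflevector <8 x i32> %a, <8 x i32> %b,
                   <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10,
                               i32 3, i32 11, i32 4, i32 12, i32 5, i32 13,
                               i32 6, i32 14, i32 7, i32 15>
  call void @llvm.vp.store.v16i32.p0(<16 x i32> %interleaved, ptr %p,
                                     <16 x i1> %m, i32 %evl)

On RISC-V this can now be matched by the same code that already handled the
plain store + shuffle form, and becomes a masked segment store (vsseg2).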
This contains two functional changes.
First, like in 28417e64, merging support for vp.store exposes the
strided store optimization for code using vp.store.
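For instance (again a sketch with made-up names, not a test from the patch),
a vp.store where only one field of a factor-2 group is actually written:

  %spread = shufflevector <4 x i32> %x, <4 x i32> poison,
              <8 x i32> <i32 poison, i32 0, i32 poison, i32 1,
                         i32 poison, i32 2, i32 poison, i32 3>
  call void @llvm.vp.store.v8i32.p0(<8 x i32> %spread, ptr %p,
                                    <8 x i1> splat (i1 true), i32 8)

can now be emitted as a single strided store of the four defined elements,
roughly:

  %p1 = getelementptr i8, ptr %p, i64 4
  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
      <4 x i32> %x, ptr %p1, i64 8, <4 x i1> splat (i1 true), i32 4)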
Second, it seems the strided store case had a significant missed
optimization: we were performing the strided store at the full width of the
shuffle operand type (i.e. at a larger LMUL than needed) rather than
reducing it to match the input width. This became obvious when I tried to
use the mask created by the helper routine, as doing so caused a type
incompatibility.
Normally, I'd try not to include an optimization in an API rework, but
structuring the code to both be correct for vp.store and not optimize the
existing case turned out to be more involved than seemed worthwhile. I
could pull this part out as a pre-change, but it's a bit awkward on its
own, since it amounts to only a half step toward the possible optimization;
the full optimization is complex with the old code structure.
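As a rough illustration of that second point (hypothetical types, not lifted
from the tests): for a factor-4 spread whose shuffle operand is <8 x i32> but
which only stores a <4 x i32> worth of data, the strided store is now emitted
on the narrow type after extracting the live subvector,

  %data = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %wide, i64 0)
  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
      <4 x i32> %data, ptr %base, i64 16, <4 x i1> %lanemask, i32 4)

rather than on the full <8 x i32> operand type, which on RISC-V lets the
store run at a smaller LMUL.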
---------
Co-authored-by: Craig Topper <craig.topper@sifive.com>
Diffstat (limited to 'llvm/lib')
 llvm/lib/CodeGen/InterleavedAccessPass.cpp       |  38
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  |   7
 llvm/lib/Target/AArch64/AArch64ISelLowering.h    |   3
 llvm/lib/Target/ARM/ARMISelLowering.cpp          |   7
 llvm/lib/Target/ARM/ARMISelLowering.h            |   3
 llvm/lib/Target/RISCV/RISCVISelLowering.h        |   6
 llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 144
 llvm/lib/Target/X86/X86ISelLowering.h            |   3
 llvm/lib/Target/X86/X86InterleavedAccess.cpp     |   8
 9 files changed, 64 insertions(+), 155 deletions(-)
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 1298aea..1b69188 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -507,46 +507,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
 
+  Value *Mask = nullptr;
   if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
-                              ElementCount::getFixed(LaneMaskLen));
-    if (!LaneMask)
+    Mask = getMask(VPStore->getMaskParam(), Factor,
+                   ElementCount::getFixed(LaneMaskLen));
+    if (!Mask)
       return false;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
                       << "\n");
-    IRBuilder<> Builder(VPStore);
-    // We need to effectively de-interleave the shufflemask
-    // because lowerInterleavedVPStore expects individual de-interleaved
-    // values.
-    SmallVector<Value *, 10> NewShuffles;
-    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
-    auto ShuffleMask = SVI->getShuffleMask();
-
-    for (unsigned i = 0; i < Factor; i++) {
-      for (unsigned j = 0; j < LaneMaskLen; j++)
-        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
-
-      NewShuffles.push_back(Builder.CreateShuffleVector(
-          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
-    }
-
-    // Try to create target specific intrinsics to replace the vp.store and
-    // shuffle.
-    if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
-      // We already created new shuffles.
-      return true;
   } else {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
-
-    // Try to create target specific intrinsics to replace the store and
-    // shuffle.
-    if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
-      return false;
   }
 
+  // Try to create target specific intrinsics to replace the store and
+  // shuffle.
+  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+    return false;
+
   // Already have a new target specific interleaved store. Erase the old store.
   DeadInsts.insert(Store);
   DeadInsts.insert(SVI);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f026726..02ee517 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                  Value *LaneMask,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
 
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 713793e..d8403c2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,7 +215,8 @@ public:
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fd3b052..8b7f06a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21731,11 +21731,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9159f3d..825145d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -685,7 +685,8 @@ class VectorType;
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index e0a8c07..f0447e0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -434,7 +434,8 @@ public:
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@ public:
                                      Instruction *Store, Value *Mask,
                                      ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                               ArrayRef<Value *> InterleaveOps) const override;
-
   bool supportKCFIBundles() const override { return true; }
 
   SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 878401e..272b347 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -266,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                Value *LaneMask,
                                                 ShuffleVectorInst *SVI,
                                                 unsigned Factor) const {
-  IRBuilder<> Builder(SI);
-  const DataLayout &DL = SI->getDataLayout();
+  IRBuilder<> Builder(Store);
+  const DataLayout &DL = Store->getDataLayout();
 
   auto Mask = SVI->getShuffleMask();
   auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
   // Given SVI : <n*factor x ty>, then VTy : <n x ty>
   auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
                                    ShuffleVTy->getNumElements() / Factor);
-  if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
-                                    SI->getPointerAddressSpace(), DL))
+  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
     return false;
 
-  auto *PtrTy = SI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   unsigned Index;
   // If the segment store only has one active lane (i.e. the interleave is
@@ -292,27 +298,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
     unsigned ScalarSizeInBytes =
         DL.getTypeStoreSize(ShuffleVTy->getElementType());
     Value *Data = SVI->getOperand(0);
-    auto *DataVTy = cast<FixedVectorType>(Data->getType());
+    Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
     Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
-    CallInst *CI = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vp_strided_store,
-        {Data->getType(), BasePtr->getType(), Stride->getType()},
-        {Data, BasePtr, Stride, Mask, VL});
-    Align Alignment = commonAlignment(SI->getAlign(), Index * ScalarSizeInBytes);
-    CI->addParamAttr(
-        1, Attribute::getWithAlignment(CI->getContext(), Alignment));
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: Same VL as above, but i32 not xlen due to signature of
+    // vp.strided.store
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {Data, BasePtr, Stride, LaneMask, VL});
+    Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
+    CI->addParamAttr(1,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
 
     return true;
   }
 
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+      Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
 
   SmallVector<Value *, 10> Ops;
   SmallVector<int, 16> NewShuffleMask;
@@ -328,13 +334,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
     NewShuffleMask.clear();
   }
 
-  // This VL should be OK (should be executable in one vsseg instruction,
-  // potentially under larger LMULs) because we checked that the fixed vector
-  // type fits in isLegalInterleavedAccessType
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
-  Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
+  Ops.append({Ptr, LaneMask, VL});
   Builder.CreateCall(VssegNFunc, Ops);
 
   return true;
@@ -457,91 +457,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   Builder.CreateCall(VssegNFunc, Operands);
   return true;
 }
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-///   %is = tail call <vscale x 64 x i8>
-///             @llvm.vector.interleave2.nxv64i8(
-///                               <vscale x 32 x i8> %load0,
-///                               <vscale x 32 x i8> %load1
-///   %wide.rvl = shl nuw nsw i32 %rvl, 1
-///   tail call void @llvm.vp.store.nxv64i8.p0(
-///                               <vscale x 64 x i8> %is, ptr %ptr,
-///                               %mask,
-///                               i32 %wide.rvl)
-///
-/// Into:
-///   call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-///                               <vscale x 32 x i8> %load1,
-///                               <vscale x 32 x i8> %load2, ptr %ptr,
-///                               %mask,
-///                               i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
-    VPIntrinsic *Store, Value *Mask,
-    ArrayRef<Value *> InterleaveOperands) const {
-  assert(Mask && "Expect a valid mask");
-  assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
-         "Unexpected intrinsic");
-
-  const unsigned Factor = InterleaveOperands.size();
-
-  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
-  if (!VTy)
-    return false;
-
-  const DataLayout &DL = Store->getDataLayout();
-  Align Alignment = Store->getParamAlign(1).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Store);
-  Value *WideEVL = Store->getArgOperand(3);
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Store->getArgOperand(1)->getType();
-  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  if (isa<FixedVectorType>(VTy)) {
-    SmallVector<Value *, 8> Operands(InterleaveOperands);
-    Operands.append({Store->getArgOperand(1), Mask, EVL});
-    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
-                            {VTy, PtrTy, XLenTy}, Operands);
-    return true;
-  }
-
-  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-  Type *VecTupTy = TargetExtType::get(
-      Store->getContext(), "riscv.vector.tuple",
-      ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
-                              NumElts * SEW / 8),
-      Factor);
-
-  Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
-  Value *StoredVal = PoisonValue::get(VecTupTy);
-  for (unsigned i = 0; i < Factor; ++i)
-    StoredVal = Builder.CreateCall(
-        VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), ScalableVssegIntrIds[Factor - 2],
-      {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-  Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
-                       ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-  Builder.CreateCall(VssegNFunc, Operands);
-  return true;
-}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2636979..547b221 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1668,7 +1668,8 @@ namespace llvm {
     /// Lower interleaved store(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 360293bc..636b072 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 }
 
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
          0 && "Invalid interleaved store");
 
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
+
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
   auto Mask = SVI->getShuffleMask();