17 files changed, 494 insertions, 89 deletions
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 18fd3f8..fafc137 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -187,6 +187,7 @@ add_header_macro(
   arpa/inet.h
   DEPENDS
     .llvm_libc_common_h
+    .inttypes
 )
 
 add_header_macro(
diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index c22f985..76eb0a2 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -44,6 +44,7 @@ add_header_library(
   HDRS
     core_structs.h
   DEPENDS
+    libc.include.inttypes
     libc.src.__support.CPP.string_view
     libc.src.__support.FPUtil.fp_bits
 )
@@ -97,6 +98,7 @@ add_header_library(
     .core_structs
     .printf_config
     .writer
+    libc.include.inttypes
     libc.src.__support.big_int
     libc.src.__support.common
     libc.src.__support.CPP.limits
diff --git a/lldb/tools/lldb-rpc-gen/RPCCommon.cpp b/lldb/tools/lldb-rpc-gen/RPCCommon.cpp
index 34791fa..6f0abe4 100644
--- a/lldb/tools/lldb-rpc-gen/RPCCommon.cpp
+++ b/lldb/tools/lldb-rpc-gen/RPCCommon.cpp
@@ -194,7 +194,6 @@ std::string lldb_rpc_gen::GetMangledName(ASTContext &Context,
   return Mangled;
 }
 
-static auto CheckTypeForLLDBPrivate = [](const Type *Ty) {};
 bool lldb_rpc_gen::TypeIsFromLLDBPrivate(QualType T) {
   auto CheckTypeForLLDBPrivate = [](const Type *Ty) {
     if (!Ty)
diff --git a/lldb/tools/lldb-rpc-gen/lldb-rpc-gen.cpp b/lldb/tools/lldb-rpc-gen/lldb-rpc-gen.cpp
index fdcfee9..9b48796 100644
--- a/lldb/tools/lldb-rpc-gen/lldb-rpc-gen.cpp
+++ b/lldb/tools/lldb-rpc-gen/lldb-rpc-gen.cpp
@@ -102,8 +102,6 @@ public:
           lldb_rpc_gen::GetMangledName(Context, MDecl);
       const bool IsDisallowed =
           lldb_rpc_gen::MethodIsDisallowed(Context, MDecl);
-      const bool HasCallbackParameter =
-          lldb_rpc_gen::HasCallbackParameter(MDecl);
       SupportLevel MethodSupportLevel = GetMethodSupportLevel(MDecl);
       if (MethodSupportLevel == eImplemented && !IsDisallowed) {
         const lldb_rpc_gen::Method Method(MDecl, Policy, Context);
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 1b69188..65565b9 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -253,6 +253,21 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
   return false;
 }
 
+static Value *getMaskOperand(IntrinsicInst *II) { // Pick II's mask operand.
+  switch (II->getIntrinsicID()) {
+  default:
+    llvm_unreachable("Unexpected intrinsic"); // Any other ID is a caller bug.
+  case Intrinsic::vp_load:
+    return II->getOperand(1); // Mask is operand 1.
+  case Intrinsic::masked_load:
+    return II->getOperand(2); // (ptr, align, mask, passthru)
+  case Intrinsic::vp_store:
+    return II->getOperand(2); // Mask is operand 2.
+  case Intrinsic::masked_store:
+    return II->getOperand(3); // (value, ptr, align, mask)
+  }
+}
+
 // Return the corresponded deinterleaved mask, or nullptr if there is no valid
 // mask.
 static Value *getMask(Value *WideMask, unsigned Factor,
@@ -268,8 +283,12 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   if (isa<ScalableVectorType>(Load->getType()))
     return false;
 
-  if (auto *LI = dyn_cast<LoadInst>(Load);
-      LI && !LI->isSimple())
+  auto *LI = dyn_cast<LoadInst>(Load);
+  auto *II = dyn_cast<IntrinsicInst>(Load);
+  if (!LI && !II)
+    return false;
+
+  if (LI && !LI->isSimple())
     return false;
 
   // Check if all users of this load are shufflevectors. If we encounter any
@@ -322,7 +341,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   // Holds the corresponding index for each DE-interleave shuffle.
   SmallVector<unsigned, 4> Indices;
 
-  Type *VecTy = FirstSVI->getType();
+  VectorType *VecTy = cast<VectorType>(FirstSVI->getType());
 
   // Check if other shufflevectors are also DE-interleaved of the same type
   // and factor as the first shufflevector.
@@ -360,13 +379,16 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   Value *Mask = nullptr;
-  if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
-    Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
+  if (LI) {
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+  } else {
+    // Check mask operand. Handle both all-true/false and interleaved mask.
+    Mask = getMask(getMaskOperand(II), Factor, VecTy);
     if (!Mask)
       return false;
-    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
-  } else {
-    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
+                      << *Load << "\n");
   }
 
   // Try to create target specific intrinsics to replace the load and
@@ -483,15 +505,16 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
 bool InterleavedAccessImpl::lowerInterleavedStore(
     Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
   Value *StoredValue;
-  if (auto *SI = dyn_cast<StoreInst>(Store)) {
+  auto *SI = dyn_cast<StoreInst>(Store);
+  auto *II = dyn_cast<IntrinsicInst>(Store);
+  if (SI) {
     if (!SI->isSimple())
       return false;
     StoredValue = SI->getValueOperand();
-  } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
-    assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
-    StoredValue = VPStore->getArgOperand(0);
   } else {
-    llvm_unreachable("unsupported store operation");
+    assert(II->getIntrinsicID() == Intrinsic::vp_store ||
+           II->getIntrinsicID() == Intrinsic::masked_store);
+    StoredValue = II->getArgOperand(0);
   }
 
   auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
@@ -508,18 +531,18 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
          "number of stored element should be a multiple of Factor");
 
   Value *Mask = nullptr;
-  if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
+  if (SI) {
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
+  } else {
+    // Check mask operand. Handle both all-true/false and interleaved mask.
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Mask = getMask(VPStore->getMaskParam(), Factor,
+    Mask = getMask(getMaskOperand(II), Factor,
                    ElementCount::getFixed(LaneMaskLen));
     if (!Mask)
       return false;
 
-    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
-                      << "\n");
-
-  } else {
-    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
+                      << *Store << "\n");
   }
 
   // Try to create target specific intrinsics to replace the store and
@@ -592,19 +615,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     assert(II);
 
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    Value *WideMask;
-    switch (II->getIntrinsicID()) {
-    default:
-      return false;
-    case Intrinsic::vp_load:
-      WideMask = II->getOperand(1);
-      break;
-    case  Intrinsic::masked_load:
-      WideMask = II->getOperand(2);
-      break;
-    }
-
-    Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
+    Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
     if (!Mask)
       return false;
 
@@ -642,18 +653,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
   Value *Mask = nullptr;
   if (II) {
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    Value *WideMask;
-    switch (II->getIntrinsicID()) {
-    default:
-      return false;
-    case Intrinsic::vp_store:
-      WideMask = II->getOperand(2);
-      break;
-    case Intrinsic::masked_store:
-      WideMask = II->getOperand(3);
-      break;
-    }
-    Mask = getMask(WideMask, Factor,
+    Mask = getMask(getMaskOperand(II), Factor,
                    cast<VectorType>(InterleaveValues[0]->getType()));
     if (!Mask)
       return false;
@@ -687,11 +687,13 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
   using namespace PatternMatch;
   for (auto &I : instructions(F)) {
     if (match(&I, m_CombineOr(m_Load(m_Value()),
-                              m_Intrinsic<Intrinsic::vp_load>())))
+                              m_Intrinsic<Intrinsic::vp_load>())) ||
+        match(&I, m_Intrinsic<Intrinsic::masked_load>()))
       Changed |= lowerInterleavedLoad(&I, DeadInsts);
 
     if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
-                              m_Intrinsic<Intrinsic::vp_store>())))
+                              m_Intrinsic<Intrinsic::vp_store>())) ||
+        match(&I, m_Intrinsic<Intrinsic::masked_store>()))
       Changed |= lowerInterleavedStore(&I, DeadInsts);
 
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index f9c0b54..171940e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t
 def FeatureVendorXSfvqmaccdod
     : RISCVExtension<1, 0,
                      "SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)",
-                     [FeatureStdExtZve32x]>;
+                     [FeatureStdExtZve32x, FeatureStdExtZvl128b]>;
 def HasVendorXSfvqmaccdod
     : Predicate<"Subtarget->hasVendorXSfvqmaccdod()">,
       AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod),
@@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod
 def FeatureVendorXSfvqmaccqoq
     : RISCVExtension<1, 0,
                      "SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)",
-                     [FeatureStdExtZve32x]>;
+                     [FeatureStdExtZve32x, FeatureStdExtZvl256b]>;
 def HasVendorXSfvqmaccqoq
     : Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">,
       AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq),
@@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq
 def FeatureVendorXSfvfwmaccqqq
     : RISCVExtension<1, 0,
                      "SiFive Matrix Multiply Accumulate Instruction (4-by-4)",
-                     [FeatureStdExtZvfbfmin]>;
+                     [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>;
 def HasVendorXSfvfwmaccqqq
     : Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">,
       AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
index 61a915a..086ef54 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
@@ -352,6 +352,81 @@ entry:
   ret void
 }
 
+define void @buildvector_v32i8_partial(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a5, i8 %a7, i8 %a8, i8 %a15, i8 %a17, i8 %a18, i8 %a20, i8 %a22, i8 %a23, i8 %a27, i8 %a28, i8 %a31) nounwind {
+; CHECK-LABEL: buildvector_v32i8_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
+; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT:    ld.b $t0, $fp, 0
+; CHECK-NEXT:    ld.b $t1, $fp, 8
+; CHECK-NEXT:    ld.b $t2, $fp, 16
+; CHECK-NEXT:    ld.b $t3, $fp, 24
+; CHECK-NEXT:    ld.b $t4, $fp, 56
+; CHECK-NEXT:    ld.b $t5, $fp, 48
+; CHECK-NEXT:    ld.b $t6, $fp, 40
+; CHECK-NEXT:    ld.b $t7, $fp, 32
+; CHECK-NEXT:    st.b $t4, $sp, 63
+; CHECK-NEXT:    st.b $t5, $sp, 60
+; CHECK-NEXT:    st.b $t6, $sp, 59
+; CHECK-NEXT:    st.b $t7, $sp, 55
+; CHECK-NEXT:    st.b $t3, $sp, 54
+; CHECK-NEXT:    st.b $t2, $sp, 52
+; CHECK-NEXT:    st.b $t1, $sp, 50
+; CHECK-NEXT:    st.b $t0, $sp, 49
+; CHECK-NEXT:    st.b $a7, $sp, 47
+; CHECK-NEXT:    st.b $a6, $sp, 40
+; CHECK-NEXT:    st.b $a5, $sp, 39
+; CHECK-NEXT:    st.b $a4, $sp, 37
+; CHECK-NEXT:    st.b $a3, $sp, 34
+; CHECK-NEXT:    st.b $a2, $sp, 33
+; CHECK-NEXT:    st.b $a1, $sp, 32
+; CHECK-NEXT:    xvld $xr0, $sp, 32
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ret
+entry:
+  %ins0  = insertelement <32 x i8> undef,  i8   %a0,  i32 0
+  %ins1  = insertelement <32 x i8> %ins0,  i8   %a1,  i32 1
+  %ins2  = insertelement <32 x i8> %ins1,  i8   %a2,  i32 2
+  %ins3  = insertelement <32 x i8> %ins2,  i8 undef,  i32 3
+  %ins4  = insertelement <32 x i8> %ins3,  i8 undef,  i32 4
+  %ins5  = insertelement <32 x i8> %ins4,  i8   %a5,  i32 5
+  %ins6  = insertelement <32 x i8> %ins5,  i8 undef,  i32 6
+  %ins7  = insertelement <32 x i8> %ins6,  i8   %a7,  i32 7
+  %ins8  = insertelement <32 x i8> %ins7,  i8   %a8,  i32 8
+  %ins9  = insertelement <32 x i8> %ins8,  i8 undef,  i32 9
+  %ins10 = insertelement <32 x i8> %ins9,  i8 undef, i32 10
+  %ins11 = insertelement <32 x i8> %ins10, i8 undef, i32 11
+  %ins12 = insertelement <32 x i8> %ins11, i8 undef, i32 12
+  %ins13 = insertelement <32 x i8> %ins12, i8 undef, i32 13
+  %ins14 = insertelement <32 x i8> %ins13, i8 undef, i32 14
+  %ins15 = insertelement <32 x i8> %ins14, i8  %a15, i32 15
+  %ins16 = insertelement <32 x i8> %ins15, i8 undef, i32 16
+  %ins17 = insertelement <32 x i8> %ins16, i8  %a17, i32 17
+  %ins18 = insertelement <32 x i8> %ins17, i8  %a18, i32 18
+  %ins19 = insertelement <32 x i8> %ins18, i8 undef, i32 19
+  %ins20 = insertelement <32 x i8> %ins19, i8  %a20, i32 20
+  %ins21 = insertelement <32 x i8> %ins20, i8 undef, i32 21
+  %ins22 = insertelement <32 x i8> %ins21, i8  %a22, i32 22
+  %ins23 = insertelement <32 x i8> %ins22, i8  %a23, i32 23
+  %ins24 = insertelement <32 x i8> %ins23, i8 undef, i32 24
+  %ins25 = insertelement <32 x i8> %ins24, i8 undef, i32 25
+  %ins26 = insertelement <32 x i8> %ins25, i8 undef, i32 26
+  %ins27 = insertelement <32 x i8> %ins26, i8  %a27, i32 27
+  %ins28 = insertelement <32 x i8> %ins27, i8  %a28, i32 28
+  %ins29 = insertelement <32 x i8> %ins28, i8 undef, i32 29
+  %ins30 = insertelement <32 x i8> %ins29, i8 undef, i32 30
+  %ins31 = insertelement <32 x i8> %ins30, i8  %a31, i32 31
+  store <32 x i8> %ins31, ptr %dst ; 15 of 32 lanes are defined, the rest stay undef
+  ret void
+}
+
 define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
 ; CHECK-LABEL: buildvector_v16i16:
 ; CHECK:       # %bb.0: # %entry
@@ -419,6 +494,49 @@ entry:
   ret void
 }
 
+define void @buildvector_v16i16_partial(ptr %dst, i16 %a0, i16 %a2, i16 %a5, i16 %a6, i16 %a7, i16 %a12, i16 %a13) nounwind {
+; CHECK-LABEL: buildvector_v16i16_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
+; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT:    st.h $a7, $sp, 58
+; CHECK-NEXT:    st.h $a6, $sp, 56
+; CHECK-NEXT:    st.h $a5, $sp, 46
+; CHECK-NEXT:    st.h $a4, $sp, 44
+; CHECK-NEXT:    st.h $a3, $sp, 42
+; CHECK-NEXT:    st.h $a2, $sp, 36
+; CHECK-NEXT:    st.h $a1, $sp, 32
+; CHECK-NEXT:    xvld $xr0, $sp, 32
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ret
+entry:
+  %ins0  = insertelement <16 x i16> undef,  i16   %a0,  i32 0
+  %ins1  = insertelement <16 x i16> %ins0,  i16 undef,  i32 1
+  %ins2  = insertelement <16 x i16> %ins1,  i16   %a2,  i32 2
+  %ins3  = insertelement <16 x i16> %ins2,  i16 undef,  i32 3
+  %ins4  = insertelement <16 x i16> %ins3,  i16 undef,  i32 4
+  %ins5  = insertelement <16 x i16> %ins4,  i16   %a5,  i32 5
+  %ins6  = insertelement <16 x i16> %ins5,  i16   %a6,  i32 6
+  %ins7  = insertelement <16 x i16> %ins6,  i16   %a7,  i32 7
+  %ins8  = insertelement <16 x i16> %ins7,  i16 undef,  i32 8
+  %ins9  = insertelement <16 x i16> %ins8,  i16 undef,  i32 9
+  %ins10 = insertelement <16 x i16> %ins9,  i16 undef, i32 10
+  %ins11 = insertelement <16 x i16> %ins10, i16 undef, i32 11
+  %ins12 = insertelement <16 x i16> %ins11, i16  %a12, i32 12
+  %ins13 = insertelement <16 x i16> %ins12, i16  %a13, i32 13
+  %ins14 = insertelement <16 x i16> %ins13, i16 undef, i32 14
+  %ins15 = insertelement <16 x i16> %ins14, i16 undef, i32 15
+  store <16 x i16> %ins15, ptr %dst ; lanes 0, 2, 5, 6, 7, 12, 13 are defined, the rest stay undef
+  ret void
+}
+
 define void @buildvector_v8i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
 ; CHECK-LABEL: buildvector_v8i32:
 ; CHECK:       # %bb.0: # %entry
@@ -446,6 +564,38 @@ entry:
   ret void
 }
 
+define void @buildvector_v8i32_partial(ptr %dst, i32 %a2, i32 %a4, i32 %a5, i32 %a6) nounwind {
+; CHECK-LABEL: buildvector_v8i32_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
+; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT:    st.w $a4, $sp, 56
+; CHECK-NEXT:    st.w $a3, $sp, 52
+; CHECK-NEXT:    st.w $a2, $sp, 48
+; CHECK-NEXT:    st.w $a1, $sp, 40
+; CHECK-NEXT:    xvld $xr0, $sp, 32
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <8 x i32> undef, i32 undef, i32 0
+  %ins1 = insertelement <8 x i32> %ins0, i32 undef, i32 1
+  %ins2 = insertelement <8 x i32> %ins1, i32   %a2, i32 2
+  %ins3 = insertelement <8 x i32> %ins2, i32 undef, i32 3
+  %ins4 = insertelement <8 x i32> %ins3, i32   %a4, i32 4
+  %ins5 = insertelement <8 x i32> %ins4, i32   %a5, i32 5
+  %ins6 = insertelement <8 x i32> %ins5, i32   %a6, i32 6
+  %ins7 = insertelement <8 x i32> %ins6, i32 undef, i32 7
+  store <8 x i32> %ins7, ptr %dst ; lanes 2, 4, 5, 6 are defined, the rest stay undef
+  ret void
+}
+
 define void @buildvector_v4i64(ptr %dst, i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
 ; CHECK-LABEL: buildvector_v4i64:
 ; CHECK:       # %bb.0: # %entry
@@ -464,6 +614,25 @@ entry:
   ret void
 }
 
+define void @buildvector_v4i64_partial(ptr %dst, i64 %a1, i64 %a2) nounwind {
+; CHECK-LABEL: buildvector_v4i64_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a2, 0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 68
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a1, 0
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <4 x i64> undef, i64 undef, i32 0
+  %ins1 = insertelement <4 x i64> %ins0, i64   %a1, i32 1
+  %ins2 = insertelement <4 x i64> %ins1, i64   %a2, i32 2
+  %ins3 = insertelement <4 x i64> %ins2, i64 undef, i32 3
+  store <4 x i64> %ins3, ptr %dst ; lanes 1 and 2 are defined, lanes 0 and 3 stay undef
+  ret void
+}
+
 define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
 ; CHECK-LABEL: buildvector_v8f32:
 ; CHECK:       # %bb.0: # %entry
@@ -497,6 +666,38 @@ entry:
   ret void
 }
 
+define void @buildvector_v8f32_partial(ptr %dst, float %a1, float %a2, float %a5, float %a7) nounwind {
+; CHECK-LABEL: buildvector_v8f32_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
+; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
+; CHECK-NEXT:    fst.s $fa3, $sp, 60
+; CHECK-NEXT:    fst.s $fa2, $sp, 52
+; CHECK-NEXT:    fst.s $fa1, $sp, 40
+; CHECK-NEXT:    fst.s $fa0, $sp, 36
+; CHECK-NEXT:    xvld $xr0, $sp, 32
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <8 x float> undef, float undef, i32 0
+  %ins1 = insertelement <8 x float> %ins0, float   %a1, i32 1
+  %ins2 = insertelement <8 x float> %ins1, float   %a2, i32 2
+  %ins3 = insertelement <8 x float> %ins2, float undef, i32 3
+  %ins4 = insertelement <8 x float> %ins3, float undef, i32 4
+  %ins5 = insertelement <8 x float> %ins4, float   %a5, i32 5
+  %ins6 = insertelement <8 x float> %ins5, float undef, i32 6
+  %ins7 = insertelement <8 x float> %ins6, float   %a7, i32 7
+  store <8 x float> %ins7, ptr %dst ; lanes 1, 2, 5, 7 are defined, the rest stay undef
+  ret void
+}
+
 define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, double %a3) nounwind {
 ; CHECK-LABEL: buildvector_v4f64:
 ; CHECK:       # %bb.0: # %entry
@@ -517,3 +718,22 @@ entry:
   store <4 x double> %ins3, ptr %dst
   ret void
 }
+
+define void @buildvector_v4f64_partial(ptr %dst, double %a0, double %a3) nounwind {
+; CHECK-LABEL: buildvector_v4f64_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $f1_64 killed $f1_64 def $xr1
+; CHECK-NEXT:    # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 68
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <4 x double> undef, double   %a0, i32 0
+  %ins1 = insertelement <4 x double> %ins0, double undef, i32 1
+  %ins2 = insertelement <4 x double> %ins1, double undef, i32 2
+  %ins3 = insertelement <4 x double> %ins2, double   %a3, i32 3
+  store <4 x double> %ins3, ptr %dst ; lanes 0 and 3 are defined, lanes 1 and 2 stay undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index afc87d1..4dda012 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -272,6 +272,41 @@ entry:
   ret void
 }
 
+define void @buildvector_v16i8_partial(ptr %dst, i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) nounwind {
+; CHECK-LABEL: buildvector_v16i8_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.b $a6, $sp, 15
+; CHECK-NEXT:    st.b $a5, $sp, 12
+; CHECK-NEXT:    st.b $a4, $sp, 11
+; CHECK-NEXT:    st.b $a3, $sp, 8
+; CHECK-NEXT:    st.b $a2, $sp, 6
+; CHECK-NEXT:    st.b $a1, $sp, 2
+; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  %ins0  = insertelement <16 x i8> undef,  i8 undef, i32 0
+  %ins1  = insertelement <16 x i8> %ins0,  i8 undef, i32 1
+  %ins2  = insertelement <16 x i8> %ins1,  i8   %a2, i32 2
+  %ins3  = insertelement <16 x i8> %ins2,  i8 undef, i32 3
+  %ins4  = insertelement <16 x i8> %ins3,  i8 undef, i32 4
+  %ins5  = insertelement <16 x i8> %ins4,  i8 undef, i32 5
+  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
+  %ins7  = insertelement <16 x i8> %ins6,  i8 undef, i32 7
+  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
+  %ins9  = insertelement <16 x i8> %ins8,  i8 undef, i32 9
+  %ins10 = insertelement <16 x i8> %ins9,  i8 undef, i32 10
+  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
+  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
+  %ins13 = insertelement <16 x i8> %ins12, i8 undef, i32 13
+  %ins14 = insertelement <16 x i8> %ins13, i8 undef, i32 14
+  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
+  store <16 x i8> %ins15, ptr %dst ; lanes 2, 6, 8, 11, 12, 15 are defined, the rest stay undef
+  ret void
+}
+
 define void @buildvector_v8i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
 ; CHECK-LABEL: buildvector_v8i16:
 ; CHECK:       # %bb.0: # %entry
@@ -299,6 +334,31 @@ entry:
   ret void
 }
 
+define void @buildvector_v8i16_partial(ptr %dst, i16 %a1, i16 %a3, i16 %a4, i16 %a5) nounwind {
+; CHECK-LABEL: buildvector_v8i16_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.h $a4, $sp, 10
+; CHECK-NEXT:    st.h $a3, $sp, 8
+; CHECK-NEXT:    st.h $a2, $sp, 6
+; CHECK-NEXT:    st.h $a1, $sp, 2
+; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <8 x i16> undef, i16 undef, i32 0
+  %ins1 = insertelement <8 x i16> %ins0, i16   %a1, i32 1
+  %ins2 = insertelement <8 x i16> %ins1, i16 undef, i32 2
+  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
+  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
+  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
+  %ins6 = insertelement <8 x i16> %ins5, i16 undef, i32 6
+  %ins7 = insertelement <8 x i16> %ins6, i16 undef, i32 7
+  store <8 x i16> %ins7, ptr %dst ; lanes 1, 3, 4, 5 are defined, the rest stay undef
+  ret void
+}
+
 define void @buildvector_v4i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
 ; CHECK-LABEL: buildvector_v4i32:
 ; CHECK:       # %bb.0: # %entry
@@ -317,6 +377,25 @@ entry:
   ret void
 }
 
+define void @buildvector_v4i32_partial(ptr %dst, i32 %a0, i32 %a3) nounwind {
+; CHECK-LABEL: buildvector_v4i32_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pcalau12i $a3, %pc_hi20(.LCPI23_0)
+; CHECK-NEXT:    vld $vr0, $a3, %pc_lo12(.LCPI23_0)
+; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 0
+; CHECK-NEXT:    vinsgr2vr.w $vr2, $a2, 0
+; CHECK-NEXT:    vshuf.w $vr0, $vr2, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <4 x i32> undef, i32   %a0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 undef, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 undef, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32   %a3, i32 3
+  store <4 x i32> %ins3, ptr %dst ; lanes 0 and 3 are defined, lanes 1 and 2 stay undef
+  ret void
+}
+
 define void @buildvector_v2i64(ptr %dst, i64 %a0, i64 %a1) nounwind {
 ; CHECK-LABEL: buildvector_v2i64:
 ; CHECK:       # %bb.0: # %entry
@@ -331,6 +410,19 @@ entry:
   ret void
 }
 
+define void @buildvector_v2i64_partial(ptr %dst, i64 %a0) nounwind {
+; CHECK-LABEL: buildvector_v2i64_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a1, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <2 x i64> undef, i64   %a0, i32 0
+  %ins1 = insertelement <2 x i64> %ins0, i64 undef, i32 1
+  store <2 x i64> %ins1, ptr %dst ; only lane 0 is defined, lane 1 stays undef
+  ret void
+}
+
 define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float %a3) nounwind {
 ; CHECK-LABEL: buildvector_v4f32:
 ; CHECK:       # %bb.0: # %entry
@@ -352,6 +444,25 @@ entry:
   ret void
 }
 
+define void @buildvector_v4f32_partial(ptr %dst, float %a0, float %a3) nounwind {
+; CHECK-LABEL: buildvector_v4f32_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI27_0)
+; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI27_0)
+; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vr1
+; CHECK-NEXT:    # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vst $vr2, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <4 x float> undef, float   %a0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float undef, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float undef, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float   %a3, i32 3
+  store <4 x float> %ins3, ptr %dst ; lanes 0 and 3 are defined, lanes 1 and 2 stay undef
+  ret void
+}
+
 define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind {
 ; CHECK-LABEL: buildvector_v2f64:
 ; CHECK:       # %bb.0: # %entry
@@ -367,6 +478,20 @@ entry:
   ret void
 }
 
+define void @buildvector_v2f64_partial(ptr %dst, double %a1) nounwind {
+; CHECK-LABEL: buildvector_v2f64_partial:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins0 = insertelement <2 x double> undef, double undef, i32 0
+  %ins1 = insertelement <2 x double> %ins0, double   %a1, i32 1
+  store <2 x double> %ins1, ptr %dst ; only lane 1 is defined, lane 0 stays undef
+  ret void
+}
+
 ;; If `isShuffleMaskLegal` returns true, it will lead to an infinite loop.
 define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index d566069..a28b818 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -435,7 +435,7 @@
 ; RV32XCVMEM: .attribute 5, "rv32i2p1_xcvmem1p0"
 ; RV32XCVSIMD: .attribute 5, "rv32i2p1_xcvsimd1p0"
 ; RV32XCVBI: .attribute 5, "rv32i2p1_xcvbi1p0"
-; RV32XSFVFWMACCQQQ: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0"
+; RV32XSFVFWMACCQQQ: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfvfwmaccqqq1p0"
 ; RV32XTHEADCMO: .attribute 5, "rv32i2p1_xtheadcmo1p0"
 ; RV32XTHEADCONDMOV: .attribute 5, "rv32i2p1_xtheadcondmov1p0"
 ; RV32XTHEADFMEMIDX: .attribute 5, "rv32i2p1_xtheadfmemidx1p0"
@@ -610,7 +610,7 @@
 ; RV64SVVPTC: .attribute 5, "rv64i2p1_svvptc1p0"
 ; RV64SVINVAL: .attribute 5, "rv64i2p1_svinval1p0"
 ; RV64XVENTANACONDOPS: .attribute 5, "rv64i2p1_xventanacondops1p0"
-; RV64XSFVFWMACCQQQ: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0"
+; RV64XSFVFWMACCQQQ: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfvfwmaccqqq1p0"
 ; RV64XTHEADBA: .attribute 5, "rv64i2p1_xtheadba1p0"
 ; RV64XTHEADBB: .attribute 5, "rv64i2p1_xtheadbb1p0"
 ; RV64XTHEADBS: .attribute 5, "rv64i2p1_xtheadbs1p0"
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 7274e1b..26e324c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -2002,3 +2002,34 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
   %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
   ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
 }
+
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5(ptr %ptr) {
+; CHECK-LABEL: maskedload_factor5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> splat (i1 true), <20 x i32> poison) ; all-true mask
+  %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15> ; field 0: every 5th element
+  %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
+  %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
+  %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
+  %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+  %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
+  ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+}
+
+define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: maskedstore_factor2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg2e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> ; alternate elements of %v0 and %v1
+  tail call void @llvm.masked.store(<8 x i32> %interleaved.vec, ptr %ptr, i32 4, <8 x i1> splat (i1 true)) ; all-true mask
+  ret void
+}
diff --git a/llvm/test/MC/ELF/section-sym-err.s b/llvm/test/MC/ELF/section-sym-err.s
index afed21d..2f7ab69 100644
--- a/llvm/test/MC/ELF/section-sym-err.s
+++ b/llvm/test/MC/ELF/section-sym-err.s
@@ -1,6 +1,9 @@
-// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
+# RUN: not llvm-mc -filetype=obj -triple x86_64 %s -o %t 2>&1 | FileCheck %s
 
 .section foo
 foo:
+# CHECK: [[#@LINE-1]]:1: error: symbol 'foo' is already defined
 
-// CHECK: error: symbol 'foo' is already defined
+x1:
+.section x1  # x1 is already defined as a label on the previous line.
+# CHECK: <unknown>:0: error: invalid symbol redefinition
diff --git a/llvm/test/MC/ELF/section-sym-err2.s b/llvm/test/MC/ELF/section-sym-err2.s
deleted file mode 100644
index 27d8e9a..0000000
--- a/llvm/test/MC/ELF/section-sym-err2.s
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
-
-foo:
-.section foo
-
-// CHECK: error: invalid symbol redefinition
diff --git a/llvm/test/MC/ELF/section-sym2.s b/llvm/test/MC/ELF/section-sym2.s
index b404ef7..167fc8c 100644
--- a/llvm/test/MC/ELF/section-sym2.s
+++ b/llvm/test/MC/ELF/section-sym2.s
@@ -1,24 +1,27 @@
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj  --symbols -r --expand-relocs - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t
+# RUN: llvm-readelf -Srs %t | FileCheck %s
 
-// Test that we can forward reference a section.
+## Test that we can forward reference a section.
 
 mov .rodata, %rsi
-.section .rodata
+mov .debug_info, %rsi  # Forward reference: the .debug_info sections are defined further down.
 
-// CHECK:Relocations [
-// CHECK:  Section {{.*}} .rela.text {
-// CHECK:    Relocation {
-// CHECK:      Offset: 0x4
-// CHECK:      Type: R_X86_64_32S (11)
-// CHECK:      Symbol: .rodata
-// CHECK:      Addend: 0x0
-// CHECK:    }
-// CHECK:  }
-// CHECK:]
+.section .rodata,"a"
+.section .debug_info,"G",@progbits,11,comdat; .long x1
+.section .debug_info,"G",@progbits,22,comdat; .long x2
+.section .debug_info,"",@progbits; .long x0
 
-// There is only one .rodata symbol
+# CHECK:      Relocation section '.rela.debug_info' at offset {{.*}} contains 1
+# CHECK:      Relocation section '.rela.debug_info' at offset {{.*}} contains 1
+# CHECK:      Relocation section '.rela.debug_info' at offset {{.*}} contains 1
 
-// CHECK:Symbols [
-// CHECK:   Type: Section (0x3)
-// CHECK:   Section: .rodata
-// CHECK-NOT:   Section: .rodata
+# CHECK:      Symbol table '.symtab' contains 8 entries:
+# CHECK-NEXT:    Num:    Value          Size Type    Bind   Vis       Ndx Name
+# CHECK-NEXT:  0000000000000000     0 NOTYPE  LOCAL  DEFAULT   UND
+# CHECK-NEXT:  0000000000000000     0 SECTION LOCAL  DEFAULT     4 .rodata
+# CHECK-NEXT:  0000000000000000     0 SECTION LOCAL  DEFAULT    11 .debug_info
+# CHECK-NEXT:  0000000000000000     0 NOTYPE  LOCAL  DEFAULT     5 11
+# CHECK-NEXT:  0000000000000000     0 NOTYPE  LOCAL  DEFAULT     8 22
+# CHECK-NEXT:  0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND x1
+# CHECK-NEXT:  0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND x2
+# CHECK-NEXT:  0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND x0
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index b7cd712..19cc4d5 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -448,7 +448,7 @@
 # CHECK: .attribute     5, "rv32i2p1_zilsd1p0"
 
 .attribute arch, "rv64i_xsfvfwmaccqqq"
-# CHECK: attribute      5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0"
+# CHECK: attribute      5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfvfwmaccqqq1p0"
 
 .attribute arch, "rv32i_ssnpm1p0"
 # CHECK: attribute      5, "rv32i2p1_ssnpm1p0"
diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
index 4739290..a89c1ae 100644
--- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
@@ -710,7 +710,7 @@ void MemRefDependenceGraph::clearNodeLoadAndStores(unsigned id) {
 void MemRefDependenceGraph::forEachMemRefInputEdge(
     unsigned id, const std::function<void(Edge)> &callback) {
   if (inEdges.count(id) > 0)
-    forEachMemRefEdge(inEdges[id], callback);
+    forEachMemRefEdge(inEdges.at(id), callback); // Key checked above.
 }
 
 // Calls 'callback' for each output edge from node 'id' which carries a
@@ -718,7 +718,7 @@ void MemRefDependenceGraph::forEachMemRefInputEdge(
 void MemRefDependenceGraph::forEachMemRefOutputEdge(
     unsigned id, const std::function<void(Edge)> &callback) {
   if (outEdges.count(id) > 0)
-    forEachMemRefEdge(outEdges[id], callback);
+    forEachMemRefEdge(outEdges.at(id), callback); // Key checked above.
 }
 
 // Calls 'callback' for each edge in 'edges' which carries a memref
@@ -730,9 +730,6 @@ void MemRefDependenceGraph::forEachMemRefEdge(
     if (!isa<MemRefType>(edge.value.getType()))
       continue;
     assert(nodes.count(edge.id) > 0);
-    // Skip if 'edge.id' is not a loop nest.
-    if (!isa<AffineForOp>(getNode(edge.id)->op))
-      continue;
     // Visit current input edge 'edge'.
     callback(edge);
   }
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index 95848d0..1d5a665 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -1473,9 +1473,11 @@ public:
     SmallVector<MemRefDependenceGraph::Edge, 2> inEdges;
     mdg->forEachMemRefInputEdge(
         dstNode->id, [&](MemRefDependenceGraph::Edge inEdge) {
-          // Add 'inEdge' if it is a read-after-write dependence.
+          // Add 'inEdge' if it is a read-after-write dependence or an edge
+          // from a memref defining op (e.g. view-like op or alloc op).
           if (dstNode->getLoadOpCount(inEdge.value) > 0 &&
-              mdg->getNode(inEdge.id)->getStoreOpCount(inEdge.value) > 0)
+              (mdg->getNode(inEdge.id)->getStoreOpCount(inEdge.value) > 0 ||
+               inEdge.value.getDefiningOp() == mdg->getNode(inEdge.id)->op))
             inEdges.push_back(inEdge);
         });
 
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
index b059b5a..04c8c3e 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -743,3 +743,31 @@ module {
     return
   }
 }
+
+// SIBLING-MAXIMAL-LABEL: memref_cast_reused
+func.func @memref_cast_reused(%arg: memref<*xf32>) {
+  %alloc = memref.cast %arg : memref<*xf32> to memref<10xf32> // NOTE: despite its name, %alloc is a cast of %arg, not an allocation.
+  %alloc_0 = memref.alloc() : memref<10xf32>
+  %alloc_1 = memref.alloc() : memref<10xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_2 = arith.constant 1.000000e+00 : f32
+  affine.for %arg0 = 0 to 10 {
+    %0 = affine.load %alloc[%arg0] : memref<10xf32> // First loop reads the cast result.
+    %1 = arith.addf %0, %cst_2 : f32
+    affine.store %1, %alloc_0[%arg0] : memref<10xf32>
+  }
+  affine.for %arg0 = 0 to 10 {
+    %0 = affine.load %alloc[%arg0] : memref<10xf32> // Second loop reads the same cast result.
+    %1 = affine.load %alloc_1[0] : memref<10xf32>
+    %2 = arith.addf %0, %1 : f32
+    affine.store %2, %alloc_1[0] : memref<10xf32>
+  }
+  // SIBLING-MAXIMAL:      affine.for %{{.*}} = 0 to 10
+  // SIBLING-MAXIMAL:        addf
+  // SIBLING-MAXIMAL-NEXT:   affine.store
+  // SIBLING-MAXIMAL-NEXT:   affine.load
+  // SIBLING-MAXIMAL-NEXT:   affine.load
+  // SIBLING-MAXIMAL-NEXT:   addf
+  // SIBLING-MAXIMAL-NEXT:   affine.store
+  return
+}